2013-08-20 14 views
16

Convertire un numero scritto in lettere in un numero in R

Qualcuno conosce una funzione per convertire la rappresentazione testuale di un numero nel numero vero e proprio, ad es. 'ventimilatrecentocinque' in 20305? Ho dei numeri scritti in lettere nelle righe di un dataframe e voglio convertirli in numeri.

Nel pacchetto qdap è possibile sostituire i numeri scritti in cifre con le parole corrispondenti (ad esempio, 1001 diventa "one thousand one"), ma non il contrario:

library(qdap) 
replace_number("I like 346457 ice cream cones.") 
[1] "I like three hundred forty six thousand four hundred fifty seven ice cream cones." 
+0

@Henk Ho riscritto la tua domanda un po 'per rendere più chiaro che è necessario convertire le parole in numero e non viceversa. –

+2

Penso che la cosa migliore da fare sia sparare alla persona che ha inviato un file con numeri scritti come parole. Ok, seriamente, dubito che ci sia un modo per farlo se non quello di scrivere un algoritmo di parsing piuttosto dettagliato, con un enorme database di tutte le parole numeriche ('uno', 'due', ..., 'cento', 'mille', ..., 'googol') e una sorta di albero di ordinamento per gestire la precedenza. Ad esempio, nel tuo esempio ci sono due "cento", ma hanno significati diversi a seconda delle parole che li seguono nella sequenza. –

risposta

14

Ecco un inizio che dovrebbe arrivare a centinaia di migliaia.

#' Convert an English number phrase to its numeric value
#'
#' Parses phrases such as "forty six thousand four hundred fifty seven".
#' A number below one hundred that is immediately followed by a teen or a
#' multiple of ten is read as a count of hundreds, so the spoken shorthand
#' "four fifty seven" yields 457 and "twenty fifteen" yields 2015.
#'
#' Words may be separated by whitespace or hyphens; the filler word "and"
#' is ignored. Only the first element of `word` is parsed.
#'
#' @param word Character string holding the number written in English words.
#' @return A list of length two: the input `word` unchanged, and the parsed
#'   value as a double.
word2num <- function(word) {
  units <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
             six = 6, seven = 7, eight = 8, nine = 9)
  teens <- c(eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
             fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18,
             nineteen = 19)
  tens <- c(ten = 10, twenty = 20, thirty = 30, forty = 40, fifty = 50,
            sixty = 60, seventy = 70, eighty = 80, ninety = 90)
  doubles <- c(teens, tens)
  small <- c(units, doubles)
  magnitudes <- c(thousand = 1e3, million = 1e6, billion = 1e9,
                  trillion = 1e12)

  tokens <- strsplit(tolower(word), "[[:space:]-]+")[[1]]
  tokens <- tokens[!tokens %in% c("", "and")]
  n_tokens <- length(tokens)

  # `current` accumulates the group below the next magnitude word
  # (e.g. the "four hundred" in "four hundred thousand"); `total` holds the
  # groups that have already been scaled by a magnitude word.
  total <- 0
  current <- 0
  for (i in seq_along(tokens)) {
    token <- tokens[i]
    if (token %in% names(small)) {
      value <- small[[token]]
      # Shorthand: "four fifty" means four hundred fifty.
      if (i < n_tokens && tokens[i + 1] %in% names(doubles)) {
        value <- value * 100
      }
      current <- current + value
    } else if (token == "hundred") {
      # A bare "hundred" counts as one hundred.
      current <- max(current, 1) * 100
    } else if (token %in% names(magnitudes)) {
      # A bare "thousand" counts as one thousand.
      total <- total + max(current, 1) * magnitudes[[token]]
      current <- 0
    } else {
      stop("unrecognised number word: '", token, "'", call. = FALSE)
    }
  }

  list(word, total + current)
}

Risultati:

> word2num("fifty seven") 
[[1]] 
[1] "fifty seven" 

[[2]] 
[1] 57 

> word2num("four fifty seven") 
[[1]] 
[1] "four fifty seven" 

[[2]] 
[1] 457 

> word2num("six thousand four fifty seven") 
[[1]] 
[1] "six thousand four fifty seven" 

[[2]] 
[1] 6457 

> word2num("forty six thousand four fifty seven") 
[[1]] 
[1] "forty six thousand four fifty seven" 

[[2]] 
[1] 46457 

> word2num("forty six thousand four hundred fifty seven") 
[[1]] 
[1] "forty six thousand four hundred fifty seven" 

[[2]] 
[1] 46457 

> word2num("three forty six thousand four hundred fifty seven") 
[[1]] 
[1] "three forty six thousand four hundred fifty seven" 

[[2]] 
[1] 346457 

Posso già dirvi che questo non funzionerà per word2num("four hundred thousand fifty"), perché l'algoritmo non sa come gestire "hundred" e "thousand" quando compaiono uno di seguito all'altro, ma probabilmente può essere modificato. Chiunque si senta libero di modificarlo se ha dei miglioramenti, oppure di usarlo come base per la propria risposta. Ho solo pensato che fosse un problema divertente con cui giocare (per un po').

Modifica: Apparentemente Bill Venables ha un pacchetto chiamato english che può raggiungere questo risultato anche meglio del codice precedente.

+0

Sto cercando di capire in che modo il pacchetto english possa fare questo. Sembra funzionare solo nella direzione opposta (da numero a parole), ma forse mi sfugge qualcosa? –

-1

Ecco quello che penso sia una soluzione migliore.

library(stringdist) 
    library(gdata) 
    #Convert numeric words to digits 
# Report whether `string` is within `dist` edits of an English number word.
#   string: word(s) to test; compared case-insensitively.
#   dist:   largest string distance still counted as a match.
#   method: distance method forwarded to stringdist().
# Returns a single logical: TRUE if any comparison is within `dist`.
isNumericWord <- function(string, dist = 1, method = "dl") {
  units <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
             "eight", "nine")
  ten_to_nineteen <- c("ten", "eleven", "twelve", "thirteen", "fourteen",
                       "fifteen", "sixteen", "seventeen", "eighteen",
                       "nineteen")
  tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy",
            "eighty", "ninety")
  magnitudes <- c("hundred", "thousand", "million", "billion", "trillion")
  vocabulary <- c(units, ten_to_nineteen, tens, magnitudes)

  candidate <- tolower(string)
  distances <- stringdist(candidate, vocabulary, method = method)
  close_enough <- distances <= dist
  any(close_enough)
}
#' Split ordinal number words into a cardinal word plus an ordinal suffix
#'
#' Punctuation is replaced by spaces, then each word that looks like an
#' ordinal is rewritten as "<cardinal> <suffix>", for example
#' "first" -> "one st", "sixth" -> "six th", "eleventh" -> "eleven th".
#' Matching is fuzzy: a word counts as an ordinal when its string distance
#' to the ordinal is at most `dist`. Words that are exactly a cardinal
#' number word are never rewritten, so "eight" (one edit away from
#' "eighth") is left alone.
#'
#' @param string Character string; only the first element is processed.
#' @param dist Largest string distance still counted as a match.
#' @param method Distance method forwarded to `stringdist()`.
#' @return A single string of space-separated words, or "" when `string`
#'   contains no words.
numberTypes <- function(string, dist = 1, method = "dl") {
  nums <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen",
            "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
            "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
            "seventy", "eighty", "ninety", "hundred", "thousand", "million",
            "billion", "trillion")
  # Irregular ordinals and their "<cardinal> <suffix>" replacement, in the
  # order in which they are tried.
  ordinals <- c(
    first = "one st", second = "two nd", third = "three rd",
    fourth = "four th", fifth = "five th", sixth = "six th",
    seventh = "seven th", eighth = "eight th", ninth = "nine th",
    tenth = "ten th", twentieth = "twenty th", thirtieth = "thirty th",
    fortieth = "forty th", fiftieth = "fifty th", sixtieth = "sixty th",
    seventieth = "seventy th", eightieth = "eighty th",
    ninetieth = "ninety th"
  )
  # For these ordinals a word ending in "y" is never treated as a match
  # (keeps e.g. "fifty" from being read as a misspelt "fifth").
  y_guarded <- c("third", "fourth", "fifth", "sixth", "seventh", "eighth",
                 "ninth")

  string <- gsub("[[:punct:]]", " ", string)
  words <- strsplit(string, split = " ")[[1]]
  words <- words[words != ""]
  if (length(words) == 0) {
    return("")
  }

  lower <- tolower(words)
  is_cardinal <- lower %in% nums
  converted <- rep(FALSE, length(words))
  for (ordinal in names(ordinals)) {
    hit <- !converted & !is_cardinal &
      stringdist(ordinal, lower, method = method) <= dist
    if (ordinal %in% y_guarded) {
      hit <- hit & !endsWith(lower, "y")
    }
    words[hit] <- ordinals[[ordinal]]
    converted <- converted | hit
  }

  # Regular ordinals: "<cardinal>th" (eleventh, hundredth, ...). Words that
  # were already split above are skipped so they are not split twice.
  for (i in which(!converted)) {
    word <- words[i]
    len <- nchar(word)
    if (len > 2 && substr(word, len - 1, len) == "th") {
      stem <- substr(word, 1, len - 2)
      if (any(stringdist(tolower(stem), nums, method = method) <= dist)) {
        words[i] <- paste(stem, "th")
      }
    }
  }

  # Collapse any run of spaces into a single space.
  gsub(" +", " ", paste(words, collapse = " "))
}

# Convert number words to digits

# Look up the numeric value of a single lower-case number word.
# An exact vocabulary match wins; otherwise the first entry within `dist`
# is used, trying single digits, then teens/tens, then magnitudes. Words
# ending in "y" are never matched fuzzily against single digits.
# Returns NA_real_ when nothing matches.
.number_word_value <- function(word, dist, method) {
  one_digits <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
                  six = 6, seven = 7, eight = 8, nine = 9)
  double_digits <- c(eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
                     fifteen = 15, sixteen = 16, seventeen = 17,
                     eighteen = 18, nineteen = 19, ten = 10, twenty = 20,
                     thirty = 30, forty = 40, fifty = 50, sixty = 60,
                     seventy = 70, eighty = 80, ninety = 90)
  large_digits <- c(hundred = 100, thousand = 1000, million = 1e6,
                    billion = 1e9, trillion = 1e12)

  exact <- c(one_digits, double_digits, large_digits)
  if (word %in% names(exact)) {
    return(exact[[word]])
  }

  tables <- list(double_digits, large_digits)
  if (!endsWith(word, "y")) {
    tables <- c(list(one_digits), tables)
  }
  for (tbl in tables) {
    near <- which(stringdist(word, names(tbl), method = method) <= dist)
    if (length(near) > 0) {
      return(tbl[[near[1]]])
    }
  }
  NA_real_
}

# Combine the values of consecutive number words into one or more numbers.
# `current` is the group below the next magnitude word, `total` the groups
# already scaled by thousand/million/... . Two values below 100 in a row
# start a new number unless they are a multiple of ten followed by a unit
# ("twenty one" -> 21, but "nineteen eighty four" -> 19, 84).
.compose_number_run <- function(values) {
  numbers <- numeric(0)
  total <- 0
  current <- 0
  prev <- NA_real_
  for (v in values) {
    if (v < 100) {
      prev_small <- !is.na(prev) && prev < 100
      combines <- prev_small && prev >= 20 && prev %% 10 == 0 &&
        v >= 1 && v <= 9
      if (prev_small && !combines) {
        numbers <- c(numbers, total + current)
        total <- 0
        current <- 0
      }
      current <- current + v
    } else if (v == 100) {
      # A bare "hundred" counts as one hundred.
      current <- max(current, 1) * 100
    } else if (current == 0 && total > 0 && total < v) {
      # Stacked magnitudes such as "two thousand million".
      total <- total * v
    } else {
      total <- total + max(current, 1) * v
      current <- 0
    }
    prev <- v
  }
  c(numbers, total + current)
}

#' Replace English number words inside a string with digits
#'
#' " & " is rewritten as " and ", hyphens become spaces, and the text is
#' passed through `numberTypes()`, which strips punctuation and splits
#' ordinals. Each run of consecutive number words is then replaced by its
#' value written in plain digits (never scientific notation). A number
#' directly followed by an ordinal suffix ("st", "nd", "rd", "th") is
#' joined with it, e.g. "twenty first" -> "21st".
#'
#' @param string Character string; only the first element is processed.
#' @param dist Largest string distance at which a word still counts as a
#'   (misspelt) number word.
#' @param method Distance method forwarded to `stringdist()`.
#' @return `string` unchanged when it contains no number words (or is
#'   empty/`NA`); otherwise a single lower-case, space-separated string with
#'   the number words replaced by digits.
Word2Num <- function(string, dist = 1, method = "dl") {
  original <- string
  if (length(string) == 0L || is.na(string[1])) {
    return(original)
  }

  # Split the string into words
  cleaned <- gsub(" & ", " and ", string, ignore.case = TRUE)
  cleaned <- gsub("-", " ", cleaned)
  cleaned <- numberTypes(cleaned, dist = dist, method = method)
  words <- strsplit(tolower(cleaned), " ")[[1]]
  words <- words[words != ""]
  if (length(words) == 0L) {
    return(original)
  }

  # Value of every number word; NA marks ordinary words
  values <- vapply(
    words,
    function(w) {
      if (isTRUE(isNumericWord(w, dist = dist, method = method))) {
        .number_word_value(w, dist, method)
      } else {
        NA_real_
      }
    },
    numeric(1),
    USE.NAMES = FALSE
  )
  is_number <- !is.na(values)
  if (!any(is_number)) {
    return(original)
  }

  # Replace each run of consecutive number words by its digits
  runs <- rle(is_number)
  run_end <- cumsum(runs$lengths)
  run_start <- run_end - runs$lengths + 1L
  pieces <- vector("list", length(run_end))
  flags <- vector("list", length(run_end))
  for (k in seq_along(run_end)) {
    idx <- seq.int(run_start[k], run_end[k])
    if (runs$values[k]) {
      # "%.0f" prints whole numbers in full, independent of options(scipen).
      pieces[[k]] <- sprintf("%.0f", .compose_number_run(values[idx]))
    } else {
      pieces[[k]] <- words[idx]
    }
    flags[[k]] <- rep(runs$values[k], length(pieces[[k]]))
  }
  tokens <- unlist(pieces)
  from_number <- unlist(flags)

  # Combine numbers with their endings, only where a suffix really follows
  following <- c(tokens[-1], "")
  has_suffix <- from_number & following %in% c("rd", "th", "st", "nd")
  if (any(has_suffix)) {
    suffix_at <- which(has_suffix) + 1L
    tokens[has_suffix] <- paste0(tokens[has_suffix], tokens[suffix_at])
    tokens <- tokens[-suffix_at]
  }

  paste(tokens, collapse = " ")
}