Ecco quello che penso sia una soluzione migliore.
library(stringdist)
library(gdata)
#Convert numeric words to digits
# Tell whether `string` is, up to a few typos, an English number word.
#
# string: the word to test; it is lower-cased before comparison.
# dist:   largest edit distance still counted as a match.
# method: distance method handed to stringdist().
#
# Returns TRUE when at least one vocabulary word lies within `dist` of
# `string`, FALSE otherwise.
isNumericWord <- function(string, dist = 1, method = "dl") {
  units <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
             "eight", "nine")
  ten_to_nineteen <- c("ten", "eleven", "twelve", "thirteen", "fourteen",
                       "fifteen", "sixteen", "seventeen", "eighteen",
                       "nineteen")
  tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy",
            "eighty", "ninety")
  magnitudes <- c("hundred", "thousand", "million", "billion", "trillion")
  vocabulary <- c(units, ten_to_nineteen, tens, magnitudes)

  edits <- stringdist(tolower(string), vocabulary, method = method)
  any(edits <= dist)
}
# Rewrite ordinal number words as "<cardinal word> <suffix>" so that the
# cardinal part can later be converted to digits ("fourth" -> "four th",
# "twenty-first" -> "twenty one st").
#
# string: text to process; only its first element is used. Punctuation is
#         replaced by spaces before the text is split into words.
# dist:   largest edit distance still counted as a match.
# method: distance method handed to stringdist().
#
# Returns the processed words joined by single spaces, or "" when the input
# contains no words.
numberTypes <- function(string, dist = 1, method = "dl") {
  nums <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen",
            "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
            "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
            "seventy", "eighty", "ninety", "hundred", "thousand", "million",
            "billion", "trillion")

  # Irregular ordinals whose stem is not simply the cardinal plus "th".
  # They are tried in this order, each against the current state of the words.
  ordinal <- c("first", "second", "third", "fourth", "fifth", "sixth",
               "seventh", "eighth", "ninth", "tenth", "twelfth",
               "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth",
               "seventieth", "eightieth", "ninetieth")
  replacement <- c("one st", "two nd", "three rd", "four th", "five th",
                   "six th", "seven th", "eight th", "nine th", "ten th",
                   "twelve th", "twenty th", "thirty th", "forty th",
                   "fifty th", "sixty th", "seventy th", "eighty th",
                   "ninety th")
  # "fifty", "sixty", ... are one edit away from "fifth", "sixth", ...: for
  # third..ninth a word ending in "y" must not be taken for the ordinal.
  skip_if_ends_in_y <- ordinal %in% c("third", "fourth", "fifth", "sixth",
                                      "seventh", "eighth", "ninth")

  words <- strsplit(gsub("[[:punct:]]", " ", string), split = " ")[[1]]
  words <- words[words != ""]
  if (length(words) == 0) {
    return("")
  }

  for (k in seq_along(ordinal)) {
    lowered <- tolower(words)
    hit <- stringdist(ordinal[k], lowered, method = method) <= dist
    if (skip_if_ends_in_y[k]) {
      hit <- hit & !endsWith(lowered, "y")
    }
    words[which(hit)] <- replacement[k]
  }

  # Regular ordinals: a number word followed by "th" ("fourteenth").
  for (i in seq_along(words)) {
    word <- words[i]
    len <- nchar(word)
    stem <- substr(word, 1, len - 2)
    suffix <- substr(word, len - 1, len)
    if (suffix == "th" && len != 2 &&
        any(stringdist(tolower(stem), nums, method = method) <= dist)) {
      words[i] <- paste(stem, suffix, sep = " ")
    }
  }

  # Tokens such as "four th" are split a second time by the loop above
  # (their stem "four " is one edit from "four"), which leaves a double
  # space; collapse runs of spaces so the result is single-spaced.
  gsub(" +", " ", paste(words, collapse = " "))
}
#Convert number words to digits
# Look up the value of one lower-case number word, tolerating typos.
# `vocab` is a named numeric vector whose first `n_units` entries are the
# single digits. Returns NA_real_ when no entry is within `dist` edits.
.word2numValue <- function(word, vocab, n_units, dist, method) {
  d <- stringdist(word, names(vocab), method = method)
  eligible <- d <= dist
  # "eighty" is one edit away from "eight": a word ending in "y" is never
  # read as a single digit.
  if (endsWith(word, "y")) {
    eligible[seq_len(n_units)] <- FALSE
  }
  eligible <- which(eligible)
  if (length(eligible) == 0L) {
    return(NA_real_)
  }
  # The closest entry wins; which.min() keeps the earliest entry on ties.
  unname(vocab[eligible[which.min(d[eligible])]])
}

# Combine the values of consecutive number words into the number(s) they
# spell, e.g. c(1, 1000, 2, 100) -> 1200 and c(20, 1) -> 21. Words that cannot
# continue the number being read start a new one: c(5, 5) -> c(5, 5).
.word2numCombine <- function(values) {
  numbers <- numeric(length(values))
  n_done <- 0L
  total <- 0            # sum of the finished thousand/million/... segments
  segment <- 0          # value below 1000 currently being read
  slot <- 0L            # part below 100: 0 empty, 1 tens read, 2 complete
  seg_open <- FALSE     # has the current segment consumed a word yet?
  has_hundred <- FALSE  # has "hundred" already been applied to the segment?
  last_scale <- Inf     # most recent thousand/million/... multiplier

  for (v in values) {
    fits <- if (v < 100) {
      # Only a unit 1-9 may follow a tens word ("twenty" + "one").
      slot == 0L || (slot == 1L && v >= 1 && v <= 9)
    } else if (v < 1000) {
      !has_hundred && (!seg_open || segment > 0)
    } else {
      TRUE
    }
    if (!fits) {
      n_done <- n_done + 1L
      numbers[n_done] <- total + segment
      total <- 0
      segment <- 0
      slot <- 0L
      seg_open <- FALSE
      has_hundred <- FALSE
      last_scale <- Inf
    }

    if (v < 100) {
      segment <- segment + v
      slot <- if (v >= 20) 1L else 2L
      seg_open <- TRUE
    } else if (v < 1000) {
      # A bare "hundred" counts as one hundred.
      segment <- max(segment, 1) * 100
      slot <- 0L
      has_hundred <- TRUE
      seg_open <- TRUE
    } else {
      if (v >= last_scale) {
        # Rising scale ("two thousand million"): scale everything read so far.
        total <- (total + segment) * v
      } else {
        # A bare "thousand"/"million"/... counts as one of them.
        total <- total + (if (seg_open) segment else 1) * v
      }
      segment <- 0
      slot <- 0L
      seg_open <- FALSE
      has_hundred <- FALSE
      last_scale <- v
    }
  }
  n_done <- n_done + 1L
  numbers[n_done] <- total + segment
  numbers[seq_len(n_done)]
}

# Replace English number words in a string with digits.
#
# string: text to convert; only its first element is processed.
# dist:   largest edit distance at which a misspelt word still counts as a
#         number word; forwarded to numberTypes() and isNumericWord().
# method: distance method handed to stringdist().
#
# Returns `string` unchanged when it contains no number words. Otherwise the
# text is returned lower-cased, with punctuation removed by numberTypes(),
# every run of consecutive number words replaced by its digits, and ordinal
# suffixes attached ("twenty-first" -> "21st"). Any other word, including
# "and", ends a run: "one hundred and one" becomes "100 and 1".
Word2Num <- function(string, dist = 1, method = "dl") {
  original <- string

  # Lookup order: single digits, teens, tens, large multipliers.
  vocab <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
             six = 6, seven = 7, eight = 8, nine = 9,
             eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
             fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18,
             nineteen = 19,
             ten = 10, twenty = 20, thirty = 30, forty = 40, fifty = 50,
             sixty = 60, seventy = 70, eighty = 80, ninety = 90,
             hundred = 100, thousand = 1000, million = 1e6, billion = 1e9,
             trillion = 1e12)
  n_units <- 10L
  suffixes <- c("st", "nd", "rd", "th")

  # Split the string into words
  string <- gsub("-", " ", gsub(" & ", " and ", string, ignore.case = TRUE))
  string <- numberTypes(string, dist = dist, method = method)
  words <- strsplit(tolower(string), " ")[[1]]
  words <- words[words != ""]

  # Value of every number word; NA marks ordinary words.
  values <- vapply(words, function(word) {
    if (!isTRUE(isNumericWord(word, dist = dist, method = method))) {
      return(NA_real_)
    }
    .word2numValue(word, vocab, n_units, dist, method)
  }, numeric(1), USE.NAMES = FALSE)
  if (all(is.na(values))) {
    return(original)
  }

  # Replace each run of consecutive number words by position, so repeated
  # words elsewhere in the text are left alone.
  runs <- rle(!is.na(values))
  run_end <- cumsum(runs$lengths)
  run_start <- run_end - runs$lengths + 1L
  for (k in which(runs$values)) {
    from <- run_start[k]
    to <- run_end[k]
    # format() renders plain digits without touching the global scipen option.
    digits <- format(.word2numCombine(values[from:to]),
                     scientific = FALSE, trim = TRUE)
    text <- paste(digits, collapse = " ")
    # Attach an ordinal suffix that directly follows the run ("1" "st").
    after <- to + 1L
    if (after <= length(words) && words[after] %in% suffixes) {
      text <- paste0(text, words[after])
      words[after] <- ""
    }
    words[from] <- text
    if (to > from) {
      words[(from + 1L):to] <- ""
    }
  }

  words <- words[words != ""]
  trimws(paste(words, collapse = " "))
}
@Henk Ho riscritto un po' la tua domanda per rendere più chiaro che è necessario convertire le parole in numeri e non viceversa. –
Penso che la cosa migliore da fare sia sparare alla persona che ha inviato un file con numeri scritti come parole. Ok, seriamente, dubito che ci sia un modo per farlo se non quello di scrivere un algoritmo di parsing piuttosto dettagliato che abbia un enorme database di tutte le parole numeriche ('uno', 'due', ..., 'cento', 'mille', ..., 'googol') così come una specie di albero di ordinamento per la precedenza. Ad esempio, nel tuo esempio ci sono due "cento", ma hanno significati diversi in base alle parole che li seguono in sequenza. –