Convierte un número escrito en un número en R

¿Alguien conoce una función para convertir la representación textual de un número en un valor numérico, por ejemplo, “veinte mil trescientos cinco” en 20305? Tengo números escritos con palabras en las filas de un data frame y quiero convertirlos en valores numéricos.

En el paquete qdap se pueden reemplazar los números escritos con cifras por sus equivalentes en palabras (por ejemplo, 1001 se convierte en "one thousand one"), pero no a la inversa:

library(qdap) replace_number("I like 346457 ice cream cones.") [1] "I like three hundred forty six thousand four hundred fifty seven ice cream cones." 

Este es un punto de partida que debería funcionar con números de hasta cientos de miles.

 #' Convert an English number phrase into its numeric value.
 #'
 #' Handles phrases built from "zero".."nineteen", the tens ("ten".."ninety")
 #' and the scale words "hundred" / "thousand", e.g.
 #' "forty six thousand four hundred fifty seven". A unit placed directly
 #' before a teen/tens word is read as hundreds ("four fifty seven" is 457).
 #'
 #' Limitation: consecutive scale words ("four hundred thousand fifty") are
 #' not supported and raise an error.
 #'
 #' @param word A single character string; only its first element is used.
 #'   Matching is case-insensitive and words are separated by spaces.
 #' @return A two-element list: the input `word` and the numeric value.
 word2num <- function(word) {
   units <- c(zero = 0, one = 1, two = 2, three = 3, four = 4,
              five = 5, six = 6, seven = 7, eight = 8, nine = 9)
   teens <- c(eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
              fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18,
              nineteen = 19)
   tens <- c(ten = 10, twenty = 20, thirty = 30, forty = 40, fifty = 50,
             sixty = 60, seventy = 70, eighty = 80, ninety = 90)
   scales <- c(hundred = 100, thousand = 1000)
   values <- c(units, teens, tens)
   doubles <- c(names(teens), names(tens))

   tokens <- strsplit(tolower(word), " ")[[1]]
   # Repeated, leading or trailing spaces yield empty tokens; drop them so
   # they are not mistaken for words.
   tokens <- tokens[nzchar(tokens)]
   n <- length(tokens)

   out <- 0
   i <- 1
   while (i <= n) {
     token <- tokens[i]
     if (token %in% names(values)) {
       temp <- values[[token]]
     } else if (i == 1 && token %in% names(scales)) {
       # A scale word may open the phrase ("hundred" on its own is 100).
       temp <- scales[[token]]
     } else {
       # Fail loudly instead of silently reusing the previous word's value.
       stop(
         sprintf("word2num: cannot interpret '%s' at position %d", token, i),
         call. = FALSE
       )
     }

     nxt <- if (i < n) tokens[i + 1] else ""
     after_scale <- i > 1 && tokens[i - 1] %in% names(scales)
     step <- 1
     if (nxt %in% names(scales)) {
       mult <- scales[[nxt]]
       if (after_scale) {
         # "... thousand four hundred": scale only the current word.
         out <- out + mult * temp
       } else {
         # "forty six thousand": scale everything accumulated so far.
         out <- mult * (out + temp)
       }
       step <- 2  # the scale word has been consumed as well
     } else if (nxt %in% doubles) {
       # "four fifty seven": a word followed by a teen/tens word counts
       # as hundreds.
       out <- out + temp * 100
     } else {
       out <- out + temp
     }
     i <- i + step
   }

   list(word, out)
 }

Resultados:

 > word2num("fifty seven") [[1]] [1] "fifty seven" [[2]] [1] 57 > word2num("four fifty seven") [[1]] [1] "four fifty seven" [[2]] [1] 457 > word2num("six thousand four fifty seven") [[1]] [1] "six thousand four fifty seven" [[2]] [1] 6457 > word2num("forty six thousand four fifty seven") [[1]] [1] "forty six thousand four fifty seven" [[2]] [1] 46457 > word2num("forty six thousand four hundred fifty seven") [[1]] [1] "forty six thousand four hundred fifty seven" [[2]] [1] 46457 > word2num("three forty six thousand four hundred fifty seven") [[1]] [1] "three forty six thousand four hundred fifty seven" [[2]] [1] 346457 

Ya puedo adelantar que esto no funcionará para word2num("four hundred thousand fifty"), porque el algoritmo no sabe cómo manejar los términos consecutivos "hundred" y "thousand", aunque probablemente se pueda modificar para ello. Cualquiera puede sentirse libre de editar esto si tiene mejoras, o de basarse en ello para su propia respuesta. Solo me pareció un problema divertido con el que jugar (durante un rato).

Editar: Aparentemente Bill Venables tiene un paquete llamado english que puede lograr esto incluso mejor que el código anterior.

Esto es lo que creo que es una mejor solución.

  library(stringdist)
  library(gdata)

  # English number vocabulary shared by the functions below. The order matters
  # only for tie-breaking in fuzzy matches: units, teens, tens, scale words.
  .number_words <- c(
    zero = 0, one = 1, two = 2, three = 3, four = 4,
    five = 5, six = 6, seven = 7, eight = 8, nine = 9,
    eleven = 11, twelve = 12, thirteen = 13, fourteen = 14, fifteen = 15,
    sixteen = 16, seventeen = 17, eighteen = 18, nineteen = 19,
    ten = 10, twenty = 20, thirty = 30, forty = 40, fifty = 50,
    sixty = 60, seventy = 70, eighty = 80, ninety = 90,
    hundred = 100, thousand = 1e3, million = 1e6, billion = 1e9,
    trillion = 1e12
  )

  #' Does a word (fuzzily) match an English number word?
  #'
  #' @param string Word to test; compared case-insensitively.
  #' @param dist Maximum string distance still counted as a match.
  #' @param method Distance method passed to `stringdist()`.
  #' @return `TRUE` if any number word lies within `dist` of `string`.
  isNumericWord <- function(string, dist = 1, method = "dl") {
    distances <- stringdist(tolower(string), names(.number_words),
                            method = method)
    any(distances <= dist)
  }

  #' Split ordinal number words into a cardinal word plus a suffix.
  #'
  #' Punctuation is replaced by spaces, then ordinals are rewritten, e.g.
  #' "first" -> "one st", "fifth" -> "five th", "twentieth" -> "twenty th",
  #' "fourteenth" -> "fourteen th". The case of untouched words is kept.
  #'
  #' @param string Text to process; only its first element is used.
  #' @param dist Maximum string distance still counted as a match.
  #' @param method Distance method passed to `stringdist()`.
  #' @return A single string of space-separated words ("" if none remain).
  numberTypes <- function(string, dist = 1, method = "dl") {
    words <- strsplit(gsub("[[:punct:]]", " ", string), split = " ")[[1]]
    words <- words[words != ""]
    if (length(words) == 0) {
      return("")
    }

    # Irregular ordinals and their replacements, applied in this order.
    ordinals <- c(
      first = "one st", second = "two nd", third = "three rd",
      fourth = "four th", fifth = "five th", sixth = "six th",
      seventh = "seven th", eighth = "eight th", ninth = "nine th",
      tenth = "ten th", twentieth = "twenty th", thirtieth = "thirty th",
      fortieth = "forty th", fiftieth = "fifty th", sixtieth = "sixty th",
      seventieth = "seventy th", eightieth = "eighty th",
      ninetieth = "ninety th"
    )
    # For these ordinals a word ending in "y" is never a match, so that e.g.
    # "fifty" (one edit away from "fifth") is left alone.
    guarded <- c("third", "fourth", "fifth", "sixth", "seventh", "eighth",
                 "ninth")
    for (ordinal in names(ordinals)) {
      lowered <- tolower(words)
      hit <- stringdist(ordinal, lowered, method = method) <= dist
      if (ordinal %in% guarded) {
        hit <- hit & !endsWith(lowered, "y")
      }
      words[which(hit)] <- ordinals[[ordinal]]
    }

    # Regular ordinals: a number word followed by "th" ("fourteenth").
    vocabulary <- names(.number_words)
    for (k in seq_along(words)) {
      w <- words[k]
      len <- nchar(w)
      stem <- substr(w, 1, len - 2)
      if (len != 2 && substr(w, len - 1, len) == "th" &&
            any(stringdist(tolower(stem), vocabulary, method = method) <= dist)) {
        words[k] <- paste(stem, "th")
      }
    }

    # Entries rewritten above can pick up a doubled space; collapse it.
    gsub(" +", " ", paste(words, collapse = " "))
  }

  # Value of the number word closest to `word` (within `dist`), or NA when
  # `word` is not a number word.
  .number_word_value <- function(word, dist = 1, method = "dl") {
    d <- stringdist(word, names(.number_words), method = method)
    ok <- !is.na(d) & d <= dist
    # A word ending in "y" is never a single digit; this stops "eighty"
    # from matching "eight".
    if (endsWith(word, "y")) {
      ok[.number_words < 10] <- FALSE
    }
    if (!any(ok)) {
      return(NA_real_)
    }
    d[!ok] <- Inf
    unname(.number_words[which.min(d)])
  }

  # Combine the values of consecutive number words into number(s):
  # c(1, 1000, 2, 100) -> 1200. Words that cannot belong to the same number
  # ("one two", "nineteen ninety") start a new number.
  .combine_number_values <- function(values) {
    numbers <- numeric(0)
    total <- 0    # completed thousand / million / ... sections
    current <- 0  # section currently being built
    prev <- NA_real_
    for (v in values) {
      if (v < 100) {
        # A sub-100 word continues the number only at the start, after a
        # scale word, or as the unit following a tens word ("twenty one").
        joins <- is.na(prev) || prev >= 100 ||
          (prev >= 20 && v >= 1 && v <= 9)
        if (!joins) {
          numbers <- c(numbers, total + current)
          total <- 0
          current <- 0
        }
        current <- current + v
      } else if (v == 100) {
        current <- max(current, 1) * 100
      } else {
        total <- total + max(current, 1) * v
        current <- 0
      }
      prev <- v
    }
    c(numbers, total + current)
  }

  #' Replace English number words in a text with digits.
  #'
  #' Example: "I have twenty one apples" -> "i have 21 apples";
  #' "the twenty first century" -> "the 21st century".
  #'
  #' When at least one number word is found, the result is lower-cased and
  #' stripped of punctuation; otherwise `string` is returned unchanged.
  #' "and" is not a number word, so "one hundred and five" becomes
  #' "100 and 5". Matching is fuzzy: any word within `dist` of a number
  #' word is treated as that number (with the default `dist = 1`, "to" is
  #' read as "two"); use `dist = 0` for exact matching.
  #'
  #' @param string Text to convert; only its first element is used.
  #' @param dist Maximum string distance still counted as a match.
  #' @param method Distance method passed to `stringdist()`.
  #' @return A single string with number words replaced by digits.
  Word2Num <- function(string, dist = 1, method = "dl") {
    original <- string

    # Normalise separators, split ordinals, then tokenise in lower case.
    prepared <- gsub("-", " ", gsub(" & ", " and ", string, ignore.case = TRUE))
    prepared <- numberTypes(prepared, dist = dist, method = method)
    words <- strsplit(tolower(prepared), " ")[[1]]
    words <- words[words != ""]

    values <- vapply(words, .number_word_value, numeric(1),
                     dist = dist, method = method, USE.NAMES = FALSE)
    is_num <- !is.na(values)
    if (!any(is_num)) {
      return(original)
    }

    # Walk over alternating runs of number words and ordinary words.
    runs <- rle(is_num)
    ends <- cumsum(runs$lengths)
    starts <- ends - runs$lengths + 1L
    suffixes <- c("rd", "th", "st", "nd")
    pieces <- vector("list", length(ends))
    for (k in seq_along(ends)) {
      span <- starts[k]:ends[k]
      if (runs$values[k]) {
        # format() avoids scientific notation without touching options().
        pieces[[k]] <- format(.combine_number_values(values[span]),
                              scientific = FALSE, trim = TRUE)
      } else {
        chunk <- words[span]
        # Runs alternate, so for k > 1 the previous piece is a number:
        # glue an ordinal suffix onto it ("21" + "st").
        if (k > 1 && chunk[1] %in% suffixes) {
          previous <- pieces[[k - 1]]
          previous[length(previous)] <- paste0(previous[length(previous)],
                                               chunk[1])
          pieces[[k - 1]] <- previous
          chunk <- chunk[-1]
        }
        pieces[[k]] <- chunk
      }
    }

    trimws(paste(unlist(pieces), collapse = " "))
  }