Is there a way to convert number words to Integers

2018-12-31 16:24发布

I need to convert one into 1, two into 2 and so on.

Is there a way to do this with a library or a class or anything?

13条回答
与君花间醉酒
2楼-- · 2018-12-31 17:16

This could be easily be hardcoded into a dictionary if there's a limited amount of numbers you'd like to parse.

For slightly more complex cases, you'll probably want to generate this dictionary automatically, based on the relatively simple numbers grammar. Something along the lines of this (of course, generalized...)

for i in range(10):
   myDict[30 + i] = "thirty-" + singleDigitsDict[i]

If you need something more extensive, then it looks like you'll need natural language processing tools. This article might be a good starting point.

查看更多
时光乱了年华
3楼-- · 2018-12-31 17:16

Quick and dirty Java port of e_h's C# implementation (above). Note that both return double, not int.

public class Text2Double {

    public double Text2Double(String text) {

        String[] units = new String[]{
                "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
                "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                "sixteen", "seventeen", "eighteen", "nineteen",
        };

        String[] tens = new String[]{"", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"};

        String[] scales = new String[]{"hundred", "thousand", "million", "billion", "trillion"};

        Map<String, ScaleIncrementPair> numWord = new LinkedHashMap<>();
        numWord.put("and", new ScaleIncrementPair(1, 0));


        for (int i = 0; i < units.length; i++) {
            numWord.put(units[i], new ScaleIncrementPair(1, i));
        }

        for (int i = 1; i < tens.length; i++) {
            numWord.put(tens[i], new ScaleIncrementPair(1, i * 10));
        }

        for (int i = 0; i < scales.length; i++) {
            if (i == 0)
                numWord.put(scales[i], new ScaleIncrementPair(100, 0));
            else
                numWord.put(scales[i], new ScaleIncrementPair(Math.pow(10, (i * 3)), 0));
        }

        double current = 0;
        double result = 0;

        for(String word : text.split("[ -]"))
        {
            ScaleIncrementPair scaleIncrement = numWord.get(word);
            current = current * scaleIncrement.scale + scaleIncrement.increment;
            if (scaleIncrement.scale > 100) {
                result += current;
                current = 0;
            }
        }
        return result + current;
    }
}

public class ScaleIncrementPair
{
    public double scale;
    public int increment;

    public ScaleIncrementPair(double s, int i)
    {
        scale = s;
        increment = i;
    }
}
查看更多
后来的你喜欢了谁
4楼-- · 2018-12-31 17:17

I needed something a bit different since my input is from a speech-to-text conversion and the solution is not always to sum the numbers. For example, "my zipcode is one two three four five" should not convert to "my zipcode is 15".

I took Andrew's answer and tweaked it to handle a few other cases people highlighted as errors, and also added support for examples like the zipcode one I mentioned above. Some basic test cases are shown below, but I'm sure there is still room for improvement.

def is_number(x):
    if type(x) == str:
        x = x.replace(',', '')
    try:
        float(x)
    except:
        return False
    return True

def text2int (textnum, numwords={}):
    units = [
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
        'sixteen', 'seventeen', 'eighteen', 'nineteen',
    ]
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    scales = ['hundred', 'thousand', 'million', 'billion', 'trillion']
    ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    if not numwords:
        numwords['and'] = (1, 0)
        for idx, word in enumerate(units): numwords[word] = (1, idx)
        for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    textnum = textnum.replace('-', ' ')

    current = result = 0
    curstring = ''
    onnumber = False
    lastunit = False
    lastscale = False

    def is_numword(x):
        if is_number(x):
            return True
        if word in numwords:
            return True
        return False

    def from_numword(x):
        if is_number(x):
            scale = 0
            increment = int(x.replace(',', ''))
            return scale, increment
        return numwords[x]

    for word in textnum.split():
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True
            lastunit = False
            lastscale = False
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if (not is_numword(word)) or (word == 'and' and not lastscale):
                if onnumber:
                    # Flush the current number we are building
                    curstring += repr(result + current) + " "
                curstring += word + " "
                result = current = 0
                onnumber = False
                lastunit = False
                lastscale = False
            else:
                scale, increment = from_numword(word)
                onnumber = True

                if lastunit and (word not in scales):                                                                                                                                                                                                                                         
                    # Assume this is part of a string of individual numbers to                                                                                                                                                                                                                
                    # be flushed, such as a zipcode "one two three four five"                                                                                                                                                                                                                 
                    curstring += repr(result + current)                                                                                                                                                                                                                                       
                    result = current = 0                                                                                                                                                                                                                                                      

                if scale > 1:                                                                                                                                                                                                                                                                 
                    current = max(1, current)                                                                                                                                                                                                                                                 

                current = current * scale + increment                                                                                                                                                                                                                                         
                if scale > 100:                                                                                                                                                                                                                                                               
                    result += current                                                                                                                                                                                                                                                         
                    current = 0                                                                                                                                                                                                                                                               

                lastscale = False                                                                                                                                                                                                              
                lastunit = False                                                                                                                                                
                if word in scales:                                                                                                                                                                                                             
                    lastscale = True                                                                                                                                                                                                         
                elif word in units:                                                                                                                                                                                                             
                    lastunit = True

    if onnumber:
        curstring += repr(result + current)

    return curstring

Some tests...

one two three -> 123
three forty five -> 345
three and forty five -> 3 and 45
three hundred and forty five -> 345
three hundred -> 300
twenty five hundred -> 2500
three thousand and six -> 3006
three thousand six -> 3006
nineteenth -> 19
twentieth -> 20
first -> 1
my zip is one two three four five -> my zip is 12345
nineteen ninety six -> 1996
fifty-seventh -> 57
one million -> 1000000
first hundred -> 100
I will buy the first thousand -> I will buy the 1000  # probably should leave ordinal in the string
thousand -> 1000
hundred and six -> 106
1 million -> 1000000
查看更多
人间绝色
5楼-- · 2018-12-31 17:20

A quick solution is to use the inflect.py to generate a dictionary for translation.

inflect.py has a number_to_words() function, that will turn a number (e.g. 2) to it's word form (e.g. 'two'). Unfortunately, its reverse (which would allow you to avoid the translation dictionary route) isn't offered. All the same, you can use that function to build the translation dictionary:

>>> import inflect
>>> p = inflect.engine()
>>> word_to_number_mapping = {}
>>>
>>> for i in range(1, 100):
...     word_form = p.number_to_words(i)  # 1 -> 'one'
...     word_to_number_mapping[word_form] = i
...
>>> print word_to_number_mapping['one']
1
>>> print word_to_number_mapping['eleven']
11
>>> print word_to_number_mapping['forty-three']
43

If you're willing to commit some time, it might be possible to examine inflect.py's inner-workings of the number_to_words() function and build your own code to do this dynamically (I haven't tried to do this).

查看更多
步步皆殇っ
6楼-- · 2018-12-31 17:20

There's a ruby gem by Marc Burns that does it. I recently forked it to add support for years. You can call ruby code from python.

  require 'numbers_in_words'
  require 'numbers_in_words/duck_punch'

  nums = ["fifteen sixteen", "eighty five sixteen",  "nineteen ninety six",
          "one hundred and seventy nine", "thirteen hundred", "nine thousand two hundred and ninety seven"]
  nums.each {|n| p n; p n.in_numbers}

results:
"fifteen sixteen" 1516 "eighty five sixteen" 8516 "nineteen ninety six" 1996 "one hundred and seventy nine" 179 "thirteen hundred" 1300 "nine thousand two hundred and ninety seven" 9297

查看更多
步步皆殇っ
7楼-- · 2018-12-31 17:28

The majority of this code is to set up the numwords dict, which is only done on the first call.

def text2int(textnum, numwords={}):
    if not numwords:
      units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
      ]

      tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

      scales = ["hundred", "thousand", "million", "billion", "trillion"]

      numwords["and"] = (1, 0)
      for idx, word in enumerate(units):    numwords[word] = (1, idx)
      for idx, word in enumerate(tens):     numwords[word] = (1, idx * 10)
      for idx, word in enumerate(scales):   numwords[word] = (10 ** (idx * 3 or 2), 0)

    current = result = 0
    for word in textnum.split():
        if word not in numwords:
          raise Exception("Illegal word: " + word)

        scale, increment = numwords[word]
        current = current * scale + increment
        if scale > 100:
            result += current
            current = 0

    return result + current

print text2int("seven billion one hundred million thirty one thousand three hundred thirty seven")
#7100031337
查看更多
登录 后发表回答