-->

[removed] Find exactly 10 words in a prefix tree t

2019-02-27 10:34发布

问题:

I have a trie (also called a prefix tree). Given a prefix, I want to get a list of ten words that start with the prefix.

The thing that's unique about this problem is that I only want 10 of the words that start with the given prefix-- not all of them. There are optimizations that can be made, given this.

My code below I know works fine. Each node in the trie has a children property and a this_is_the_end_of_a_word property. For instance, when you insert "hi", this is what the trie looks like:

.

The problem: Given a prefix, I want to get a list of ten words that start with the prefix.

My approach to the problem was this: Head down the prefix tree, following the characters of prefix until you get to node which corresponds to the last character of prefix. Now you should perform a DFS on this node, keeping track of nodes that have this_is_the_end_of_a_word === true in a list. But you should stop searching when the length of your list is equal to 10, and return the list.

I think my approach is sound, but I'm having trouble implementing it-- particularly because I'm trying to use a recursive DFS, so I'm not sure how to pass the the "global" list around between the recursive calls. I know I should use closures, but I'm new to javascript and I'm unsure about how to go about it. An example of what I tried already is below.

My Trie class (I know this code works, this is just so you can see how I organized my data structure.)

var Trie = function() {

    var that = Object.create(Trie.prototype);
    that.children = {}; //mapping: next character -> child nodes
    that.this_is_the_end_of_a_word = false;

    that.insertWord = function(word) {

        var current_node = that;

        for (var i = 0; i < word.length; i++) {
            var c = word[i]
                //if character is not in the trie already, add it
            if (!(c in current_node.children)) {
                current_node.children[c] = Trie();
            }
            //update current_node
            current_node = current_node.children[c];
        };

        //after adding all the chars of the word, 
        //you are at the end of a word
        current_node.this_is_the_end_of_a_word = true;
    }

    that.insertWords = function(words) {
        for (var i = 0; i < words.length; i++) {
            that.insertWord(words[i]);
        }
    }

    that.contains = function(word) {
        //start at the root
        var current_node = that;
        for (var i = 0; i < word.length; i++) {
            var c = word[i];

            //if the word's character isn't a child of the current_node, 
            //the word isn't in the trie
            if (!(c in current_node.children)) {
                return false;
            }
            //move down the trie, update current_node
            current_node = current_node.children[c];
        };
        return current_node.this_is_the_end_of_a_word;
    }

    Object.freeze(that);
    return that;
}

My 1st approach (has many bugs)

num_words_to_go = 10; 
//this global is bad practice; 
//I want to put this as the argument to a closure 
//so it's passed between recursive calls

that.getWords = function(start_node, prefix) {
   console.log(0);
   var words = [];

   //if start node is a word, add it
   if (start_node.this_is_the_end_of_a_word) {
       words.push(start_node);
       num_words_to_go--;
   }

   if (num_words_to_go <= 0 || !start_node.children) {
       return words;
   }

   return start_node.children.forEach(
                              currentValue.getWords(
                                    currentValue, prefix + <character for this child>)); 

   /*I can't think of a nice way to write this without going through all of the children. 
   I know I don't need to, because I only need to find 10 words and get out. 
   This is why I was leaning towards the recursive DFS. 
   */

}

2nd approach: I also found a python example that I was looking at: http://v1v3kn.tumblr.com/post/18238156967/roll-your-own-autocomplete-solution-using-tries I tried translating his example to JavaScript, but still something's going wrong with all_suffixes.

that.all_suffixes = function (prefix){
    results = [];
    if (that.this_is_the_end_of_a_word) results.push(prefix);
    if (!(that.children)) return results;
    if (results.length > 2) return results;
    var callback = function(currentValue, i, array){
        return currentValue.all_suffixes(prefix+array[i]);
    }
    arr = that.children.forEach(callback, that);
        //[child.all_suffixes(prefix + char) for (char, child) in self.children.items()]
    return concat(reduce(concat, arr), results);        
}

 that.autocomplete = function(prefix){
    current_node = that;
    for(var i = 0; i < prefix.length; i++){
        var c = prefix[i];
        //if there is nothing in the trie with this prefix
        if (!(c in current_node.children)){
            return [];
        }
        current_node = current_node.children[c];
    }
    return list(current_node.all_suffixes(prefix))
 }

回答1:

Basically I take your model and apply a new method getWords(word[, count]) to the Trie class. I changed the method contains because I need the functionality in getWords as well. So I created a new method getNode, which returns the node where the word or part is found.

The method getWords first looks up the word (part) and then iterates through the data structure. When a word is found it is pushed to the result set. If the result set length is greater or equal to the required length, then the iteration (hence Array.prototype.some) is terminated and the recursive call of fork is stopped.

    that.getWords = function (word, count) {

        function fork(n, w) {

            function child(c) {
                return fork(n.children[c], w + c);
            }

            n.isWord && words.push(w);
            return words.length >= count || Object.keys(n.children).some(child);
        }

        var words = [],
            current_node = that.getNode(word);

        if (current_node) {
            fork(current_node, word);
            return words;
        }
    }

Side note: I changed this_is_the_end_of_a_word to isWord.

Input

  1. Create a new instance of Trie.
  2. Insert some words for testing.

Output

  1. Test if the trie contains 'motor', returns false.
  2. Test if the trie contains 'te', returns false.
  3. Test if the trie contains 'ten', returns true.
  4. Get all words that starts with 'ind' (8 available, shows 8).
  5. Get first 10 words that starts with 'in' (16 available, shows 10).
  6. The whole trie.

var Trie = function () {

    var that = Object.create(Trie.prototype);
    that.children = {}; //mapping: next character -> child nodes
    that.isWord = false;

    that.insertWord = function (word) {
        var current_node = that;
        for (var i = 0; i < word.length; i++) {
            var c = word[i]
            //if character is not in the trie already, add it
            if (!(c in current_node.children)) {
                current_node.children[c] = Trie();
            }
            //update current_node
            current_node = current_node.children[c];
        };

        //after adding all the chars of the word,
        //you are at the end of a word
        current_node.isWord = true;
    }

    that.insertWords = function (words) {
        for (var i = 0; i < words.length; i++) {
            that.insertWord(words[i]);
        }
    }

    that.getNode = function (word) {
        //start at the root
        var current_node = that;
        for (var i = 0; i < word.length; i++) {
            var c = word[i];

            //if the word's character isn't a child of the current_node,
            //the word isn't in the trie
            if (!(c in current_node.children)) {
                return;
            }
            //move down the trie, update current_node
            current_node = current_node.children[c];
        };
        return current_node;
    }

    that.contains = function (word) {
        var current_node = that.getNode(word);
        if (current_node) {
            return current_node.isWord;
        }
        return false;
    }

    that.getWords = function (word, count) {

        function fork(n, w) {

            function child(c) {
                return fork(n.children[c], w + c);
            }

            n.isWord && words.push(w);
            return words.length >= count || Object.keys(n.children).some(child);
        }

        var words = [],
            current_node = that.getNode(word);

        if (current_node) {
            fork(current_node, word);
            return words;
        }
    }

    // freeze does lock the isWord property, which is not required here
    //Object.freeze(that);
    return that;
}

var trie = new Trie();
trie.insertWords([
    'car', 'cool', 'i', 'in', 'indeed', 'independence', 'india', 'indoor', 'induction',
    'industrial', 'industry', 'indwell', 'inferior', 'informal', 'inhale', 'inn',
    'inside', 'instance', 'intrepid', 'of', 'off', 'other', 'tea', 'ted', 'ten',
    'to', 'zoo', 'zoom'
]);
document.write(trie.contains('motor') + '<br>'); // false
document.write(trie.contains('te') + '<br>'); // false
document.write(trie.contains('ten') + '<br>'); // true
document.write('<pre>' + JSON.stringify(trie.getWords('ind'), 0, 4) + '</pre>');
document.write('<pre>' + JSON.stringify(trie.getWords('in', 10), 0, 4) + '</pre>');
document.write('<pre>' + JSON.stringify(trie, 0, 4) + '</pre>');