KMP算法，前缀树和AC自动机

基础知识

单模式匹配

KMP算法(Knuth-Morris-Pratt算法)

class Solution {
public:
    int strStr(string haystack, string needle) {
        if(haystack.empty() && needle.empty()) return 0;
        if(haystack.empty() && !needle.empty()) return -1;
        if(!haystack.empty() && needle.empty()) return 0;
        //compute the next array
        auto pi = vector<size_t>(needle.length());
        pi[0] = 0;
        size_t k = 0;
        for(size_t q = 1; q < needle.length(); ++q){
            while(needle[k] != needle[q] && k > 0) k = pi[k - 1];
            if(needle[k] == needle[q]) ++k;
            pi[q] = k;
        }
        k = 0;
        for(size_t i = 0; i < haystack.length(); ++i){
            while(haystack[i] != needle[k] && k > 0) k = pi[k - 1];
            if(haystack[i] == needle[k]) ++k;
            if(k == needle.length()) return i - k + 1;
        }
        return -1;
    }
};

BM算法(Boyer-Moore算法)

多模式匹配

Tries(字典树)

原理

TODO
LC208 Implement Trie (Prefix Tree)
看了很多这个题的答案，没有人写析构函数。自己dynamically allocated的内存，用完就给扔那里了。这个类的对象析构之后，所有申请的内存全泄漏了。Leetcode也不检查一下语法(估计也没法检查？)。

下面给出几个没有内存泄漏的解法：
使用原生指针的实现

class Trie {
public:
    /** Initialize your data structure here. */
    Trie()
    :root{new TrieNode{}}
    {}
    
    ~Trie(){
        delete root;
    }
    
    /** Inserts a word into the trie. */
    void insert(string word) {
        TrieNode* p = root;
        for(auto c : word){
            auto i = static_cast<size_t>(c - 'a');
            if(p->children[i] == nullptr) p->children[i] = new TrieNode{};
            p = p->children[i];             
        }
        p->isWord = true;
    }
    /** Returns if the word is in the trie. */
    bool search(string word) {
       TrieNode* p = root;
        for(auto c : word){
            p = p->children[c - 'a'];
            if(!p) return false;
        }
        if(p->isWord) return true;
        else          return false;
    }
    
    /** Returns if there is any word in the trie that starts with the given prefix. */
    bool startsWith(string prefix) {
        TrieNode* p = root;
        for(auto c : prefix){
            p = p->children[c - 'a'];
            if(!p) return false;
        }
        return true;
    }
private:
    class TrieNode{
    public:
        bool isWord;
        array<TrieNode*, 26> children;
        
        TrieNode()
        :isWord{false}
        ,children{nullptr}
        {}
        
        ~TrieNode(){
            for(auto ptr : children){
                delete ptr;
            }
        }
    };    
    TrieNode* root;   
};

使用shared_ptr的实现，非常的慢。

class Trie {
    class TrieNode;
    using TrieNodePtr = shared_ptr<TrieNode>;
public:
    /** Initialize your data structure here. */
    Trie()
    :root{}
    {
        root = make_shared<TrieNode>();
    }

    
    /** Inserts a word into the trie. */
    void insert(string word) {
        TrieNodePtr p = root;
        for(auto c : word){
            auto i = static_cast<size_t>(c - 'a');
            if(p->children[i] == nullptr) 
                p->children[i] = make_unique<TrieNode>();
            p = p->children[i];             
        }
        p->isWord = true;
    }
    /** Returns if the word is in the trie. */
    bool search(string word) {
       TrieNodePtr p = root;
        for(auto c : word){
            p = p->children[c - 'a'];
            if(!p) return false;
        }
        if(p->isWord) return true;
        else          return false;
    }
    
    /** Returns if there is any word in the trie that starts with the given prefix. */
    bool startsWith(string prefix) {
        TrieNodePtr p = root;
        for(auto c : prefix){
            p = p->children[c - 'a'];
            if(!p) return false;
        }
        return true;
    }
private:
    class TrieNode{
    public:
        TrieNode()
        :isWord{false}
        ,children{nullptr}
        {}
        bool isWord;
        array<TrieNodePtr, 26> children;
    };
    TrieNodePtr root;  
};

另一个没有内存泄漏的实现：Accepted cpp solution without memory leak - tju_xu

AC算法(Aho-Corasick算法,字符串匹配自动机)

多模字符串匹配算法之AC自动机—原理与实现
 AC自动机 - 司徒正美

思路：

什么是fail指针？
非常类似KMP算法里的next数组。next数组里存的是$next[q] = max{k : P_k 是 P_q 的后缀}$。而fail指针所指的是：$P,Q \in {所有模式串}, max{k : Q_k 是 P_q 的后缀}$的$Q_k$. 即【从根节点开始到该节点的字符串的后缀】和【所有模式串的前缀】的最长公共子串。
如何构造一个AC自动机？
- 把所有的模式构造成一个Trie. 这个Trie的所有TrieNode要有isWord标志和fail指针(失配指针).
- 之后我们来为所有TrieNode来弄fail指针(失配指针).
  - root的fail指针赋值为nullptr.
  - 我们首先把root的所有孩子节点的fail指针赋值为root.因为如果一个字符串第一个字符都无法匹配，那这样的字符串就只能匹配空字符$\varepsilon$，所以回到root.
  - 之后将root的所有孩子放入队列，开始对Trie做BFS.
  - 从队列中pop出来一个，来为它(root)的所有孩子(child[i])找fail指针
    - 设置toFail初始为root->fail.
    - 如果toFail == nullptr
      这就是回到了root了，只能匹配空串了。设置root->childs[i]->fail = root。
    - 如果toFail->childs[i] != nullptr
      类似KMP算法，说明此时代表最长公共部分的root->fail还可以增长。那么直接设置root->childs[i]->fail = toFail>childs[i]即可
    - 如果toFail->childs[i] == nullptr
      类似KMP算法，我们需要找比这个要小的最长公共部分，所以设置toFail = toFail->fail后，再回到开头。
  - 把一个TrieNode的所有child的fail指着都设置好后，我们把该TrieNode弹出队列。
  - 一直循环直到队列为空（BFS做完）。
如何在AC自动机里做查询？
- 初始化当前指针currPtr = root
- 首先从文本中读入一个字符c
- 循环开始
- 如果c是currPtr的孩子，自动机沿着主线进入下一个TrieNode(currPtr = currPtr->child[c - 'a'])
  - 如果currPtr->isWord那么证明我们找到了一个匹配。
  - 如果!currPtr->isWord
    - 如果currPtr->fail->isWord 我们也找到了一个匹配。当前!isword不代表没有匹配，有可能其最长的后缀是一个匹配。
  - 没有匹配任何模式，读入下一个字符串
- 如果c不是currPtr的孩子，自动机需要沿着fail指针(失配指针)进入下一个TrieNode(currPtr = currPtr->fail)
  - 如果currPtr == nullptr，那就是第一个字符都无法匹配，没有匹配任何模式。
  - 回到循环头部。

LC1032 Stream of Characters

class Trie{
    class TrieNode;
    constexpr static size_t NUM_OF_LETTERS = 26;
public:
    Trie()
    :root{new TrieNode{}}
    ,currPtr{root}
    {}   
    Trie(vector<string>& words)
    :root{new TrieNode{}}
    ,currPtr{root}
    {
        for(auto& word : words) insert(word);
        buildAC();
    }
    ~Trie(){
        delete root;
    }
   
    void insert(const string& word){
        auto p = root;
        for(auto c : word){
            auto i = static_cast<size_t>(c - 'a');
            if(!p->childs[i]) p->childs[i] = new TrieNode{};
            p = p->childs[i];
        }
        p->isWord = true;
    }
    
    void buildAC(){
        auto q = queue<TrieNode*>{};
        for(auto ptr : root->childs){
            if(ptr){
                ptr->fail = root;
                q.push(ptr); 
            }
        }
        while(!q.empty()){
            auto curr = q.front();
            for(size_t i = 0; i < NUM_OF_LETTERS; ++i){
                if(curr->childs[i]){
                    auto toFail = curr->fail;
                    while(true){                       
                        if(!toFail){
                            curr->childs[i]->fail = root;
                            break;
                        }
                        if(toFail->childs[i]){
                            curr->childs[i]->fail = toFail->childs[i];
                            break;
                        }
                        else toFail = toFail->fail;                           
                    } 
                    q.push(curr->childs[i]);
                }
            }
            q.pop();
        }   
    }
    
    inline bool nextState(char c) noexcept{
        auto i = static_cast<size_t>(c - 'a');
        while(true){
            if(currPtr->childs[i]){
                currPtr = currPtr->childs[i];
                if(currPtr->isWord || currPtr->fail->isWord) return true;
                return false;
            }else{
                currPtr = currPtr->fail;
                if(!currPtr) {currPtr = root; return false;}
            }
        }
    }
    
    
private:
    TrieNode* root;
    TrieNode* currPtr;
    
    class TrieNode{
    public:
        array<TrieNode*, NUM_OF_LETTERS> childs;
        TrieNode* fail;
        bool isWord;

        TrieNode()
        :childs{nullptr}
        ,fail{nullptr}
        ,isWord{false}
        {}

        ~TrieNode(){
            for(auto ptr : childs) delete ptr;
        }
    };
    
};

class StreamChecker {
public:
    StreamChecker(vector<string>& words)
    :trie{new Trie{words}}
    {}
    
    ~StreamChecker(){
        delete trie;
    }
    
    bool query(char letter) {
        return trie->nextState(letter);
    }
private:
    Trie* trie;
};

最长回文子串(Manacher算法)

LC5, Longest Palindromic Substring

解法详见:

class Solution {
public:
    string longestPalindrome(string s) {
        string str = Str2ManacherStr(s);
        vector<size_t> p(str.length(), 0);
        size_t c = 0;
        size_t r = 0;
        size_t max_c = 0;
        size_t max_r = 0;

        for(size_t i = 1; i < str.length(); ++i)
        {
            if (i < r)
                p[i] = min(r - i, p[c - (i - c)]);
            else
                p[i] = 1;

            while(str[i + p[i]] == str[i - p[i]]) ++p[i];

            if (r < i + p[i])
            {
                r = i + p[i];
                c = i;
            }

            if (max_r < p[i])
            {
                max_r = p[i];
                max_c = i;
            }

        }

        return ManacherStr2Str(str.substr(max_c - max_r + 1, 2 * max_r - 1));
    }

    string Str2ManacherStr(const string& s)
    {
        stringstream ss{};
        ss<<"$#";
        copy(s.begin(), s.end(), ostream_iterator<char>(ss,"#"));
        return ss.str();
    }

    string ManacherStr2Str(const string& s)
    {
        stringstream ss{};
        copy_if(s.begin(),
                s.end(),
                ostream_iterator<char>(ss),
                [](char c){return c != '#' && c != '$';});
        return ss.str();
    }
};

正则表达式

正则表达式30分入门教程

元字符	含义
`.`	除`\n`以外的任意字符
`\w`	word: 字母、数字、下划线、汉字
`\s`	space:空格、全角空格、Tab、`\n`
`\d`	decimal:数字
`\b`	单词的开头或结尾：前一个字符或后一个字符，不全是\w
`^`	字符串的开始
`$`	字符串的结束

限定符	含义
`{n}`	出现$r = n$次
`{n,}`	出现$r \in [n, +\infty]$次
`{n,m}`	出现$r \in [n, m]$次
*	出现$r \in [0, \infty]$次
?	出现$r \in {0,1}$次
+	出现$r = [1, +\infty]$次

题目

单模式匹配

LC459 Repeated Substring Pattern

class Solution {
public:
    bool repeatedSubstringPattern(string s) {
        auto len = s.size();
        auto pi = vector<size_t>(len + 1);
        size_t k = 0;
        for(size_t q = 1; q < len; ++q){
            while(s[q] != s[k] && k > 0) k = pi[k];
            if(s[q] == s[k]) ++k;
            pi[q+1] = k;
        }
        return k  && (len % (len - k)) == 0;
    }
};

多模式匹配

Trie(字典树)

Map Sum

LC677 Map Sum Pairs

class MapSum {
    class node;
public:
    /** Initialize your data structure here. */
    MapSum()
    :root{new node{}}
    {}
    ~MapSum(){
        delete root;
    }
    
    void insert(string key, int val) {
        node* p = root;
        for(auto c : key){
            size_t i = static_cast<size_t>(c - 'a');
            if(!p->children[i]) p->children[i] = new node{};
            p = p->children[i];
        }
        p->weight = val;
    }
    
    int sum(string prefix) {
        node* p = root;
        for(auto c : prefix){
            p = p->children[c - 'a'];
            if(!p) return 0;
        }
        return p->_sum(); 
    }
    
private:
    node* root;
    class node{
    public:
        array<node*, 26> children;
        int weight;
        
        int _sum(){
            auto result = weight;
            for(auto ptr : children){
                result += ptr ? ptr->_sum() : 0;
            }
            return result;
        }
        
        node()
        :children{nullptr}
        ,weight(0)
        {}
        
        ~node(){
            for(auto ptr : children) delete ptr;
        }
    };
};

Add and Search Word

LC211 Add and Search Word - Data structure design

思路：

把所有的word都放到一个Trie里
做前序遍历
- 检查string的迭代器是不是到头了，如果到头了，并且该TrieNode还是isWord，就搜到了。
- 如果遇到'.'我们就搜索所有子树。
- 如果遇到其他字符我们就只搜索该字符。

class WordDictionary {
class node;
public:
    /** Initialize your data structure here. */
    WordDictionary()
    :root{new node{}}
    {}
    
    ~WordDictionary(){
        delete root;
    }
    
    /** Adds a word into the data structure. */
    void addWord(string word) {
        auto p = root;
        for(auto c : word){
            auto i = static_cast<size_t>(c - 'a');
            if(!p->children[i])
                p->children[i] = new node{};
            p = p->children[i];
        }
        p->isWord = true;
    }
    
    /** Returns if the word is in the data structure. A word could contain the dot character '.' to represent any one letter. */
    bool search(string word) {
        return searchImpl(root, word, word.begin());
    }
private:
    node* root;
    template<typename It>
    bool searchImpl(node* root, const string& word, It c){
        if(c == word.end()) return root->isWord;
        if(*c == '.'){
            for(auto ptr : root->children){
                if(ptr && searchImpl(ptr, word, c+1)) return true;    
            }
            return false;
        }
        if(root->children[*c - 'a']){
            return searchImpl(root->children[*c - 'a'], word, c+1);
        }
        else 
            return false;
    }
    
    
    class node{
    public:
        array<node*, 26> children;
        bool  isWord;
        
        node()
        :children{nullptr}
        ,isWord{false}
        {}
        
        ~node(){
            for(auto ptr : children) delete ptr;
        }
    };
};

LC720 Longest Word in Dictionary
思路：

把所有的word都放进Trie

按字典序做DFS前序遍历

如果该节点isWord并且这个时候word比max_word(result)长，就用这个word代替result即可。(这么做的原因是我们按【字典序】对Trie做的搜索，每个长度的word只要被搜出来，就一定是字典序第一个。)

如果该节点是isWord，继续搜索他的所有子树。

class Trie{
    constexpr static size_t NUM_OF_LETTERS = 26;
    constexpr static size_t ASCII_OF_a = 97;
    class TrieNode;
public:
    Trie()
    :root{new TrieNode{}}
    {}

    ~Trie(){
    	delete root;
    }

    void insert(const string& word){
        auto p = root;
        for(auto c : word){
            auto i = static_cast<size_t>(c - 'a');
            if(!p->childs[i]) p->childs[i] = new TrieNode{};
            p = p->childs[i];
        }
        p->isWord = true;
    }
    string dfs(){
        auto word   = string{};
        auto result = string{};
        dfsImpl(root, word, result);  
        return result;
    }
private:
    TrieNode* root;
    void dfsImpl(TrieNode* root, string& word, string& result){
        if(root->isWord && word.size() > result.size())
             result = word;
        if(root->isWord || root == this->root){
            for(size_t i = 0; i < NUM_OF_LETTERS; ++i){
                if(root->childs[i]){
                    cout<<static_cast<char>(ASCII_OF_a + i)<<endl;
                    word.push_back(static_cast<char>(ASCII_OF_a + i));
                    dfsImpl(root->childs[i], word, result);
                    word.pop_back();
                }
            }
        }
    }
    class TrieNode{
    public:
        array<TrieNode*, NUM_OF_LETTERS> childs;
        bool isWord;
        
        TrieNode()
        :childs{nullptr}
        ,isWord{false}
        {}
        
        ~TrieNode(){
            for(auto ptr : childs) delete ptr;
        }
    };
};
class Solution {
public:
    string longestWord(vector<string>& words) {
        auto trie = Trie{};
        for(auto word : words) trie.insert(word);
        return trie.dfs();
    }
};

正则表达式匹配

由*和.构成的正则表达式匹配

Regular Expression Matching
思路：
不能想着去构造DFA的方法去做这样的题，时间是不可能够的。直接用动态规划的思想。

从文本里拿出来一个字符，和模式匹配

如果文本字符是$\varepsilon$(空)，且模式也是$\varepsilon$，则匹配成功。
如果文本字符不是$\varepsilon$，但模式是$\varepsilon$，匹配失败。
如果模式字符的下一个字符是*
- 如果文本字符和模式字符匹配或文本字符不是$\varepsilon$的同时，模式字符是.
  - 我们可以忽略这个a*，让文本停留在当前字符，模式向后移动两个。
  - 我们可以不忽略这个a*，让匹配停留在该状态，即文本向后移动一个字符，模式不移动。
- 如果文本字符和模式字符不匹配，我们此时只能忽略a*，即让文本停留在当前字符，模式向后移动两个字符。

如果模式字符的下一个字符不是*

如果文本字符和模式字符匹配或文本字符不是$\varepsilon$的同时，模式字符是.，我们可以让文本及模式都向后移动一个字符。

否则匹配失败。

class Solution {
public:
    bool isMatch(string a, string b) {
        auto t = a.c_str();
        auto p = b.c_str();
        return isMatchImpl(t,p);
    }
    
    bool isMatchImpl(const char* t, const char* p){
        if(*t == '\0' && *p == '\0') return true;
        if(*t != '\0' && *p == '\0') return false;
        if(p + 1 && *(p + 1) == '*'){
            if(*t == *p || (*t != '\0' && *p == '.'))
                return isMatchImpl(t, p + 2) || isMatchImpl(t + 1, p);
            else
                return isMatchImpl(t, p + 2);
        }
        if(*t == *p || (*t != '\0' && *p == '.')) 
            return isMatchImpl(t + 1, p + 1);
        return false;
    }
};

有效的数字表达式

LC65 Valid Number

其他

LC425 Word Squares premium