Thursday, November 6, 2014

Regular Expression Matching

the res[i][j] means preceding substring of length i of s and length j of p. For any two substrings, res[i][j] is true if and only if one of the following cases is satisfied:
case A: the j th character of p is not '*'
  • case 1: res[i-1][j-1] is true, and ith character of s is equal to j th character of p. Or j th character of p is '.'
case B: the j th character of p is '*'
  • case 2:res[i-1][j] is true, the preceding character of '*' matches incoming character of s, and the pattern like (a*) will match one or more a.
  • case 3: res[i][j-2] is true, and the pattern like (a*) will match an empty string
base case is the res[0][0], res[i][0], res[0][j].

note that in the above algorithm description, the index i, j starts from 1.

    bool isMatch(const char *s, const char *p) {
        if (*p == '\0') return *s == '\0';  //empty
        int lens=strlen(s);
        int lenp=strlen(p);
        bool res[lens+1][lenp+1];

        for(int i=0;i<lens+1;i++)
            for(int j=0;j<lenp+1;j++)
                res[i][j]=false;
       
        res[0][0]=true;//base case
        for(int j=0;j<lenp;j++){//base case of res[0][j+1]
               if(p[j]=='*'){
                 if(j>0&&res[0][j-1]) res[0][j+1]=true;
               }
        }
        for(int j=0;j<lenp;j++)// the position of these two lines can be changed.
        for(int i=0;i<lens;i++)
         {
             if(p[j]=='*'){
                if(j<1) cout<<"error input for p"<<endl;
                 if(res[i+1][j-1]||(res[i][j+1]&&(p[j-1]=='.' || p[j-1]==s[i])))
                    res[i+1][j+1]=true;         
             }
             else{
                    if(p[j]=='.'||p[j]==s[i])
                         res[i+1][j+1]=res[i][j];
            }      
         }
        return res[lens][lenp];
    }

//same idea , use only O(m) space
use OPT and PRE array to save the current optimal and previous optimal to avoid 2 dimensional space.

    bool isMatch(const char *s, const char *p) {
        if (*p == '\0') return *s == '\0';  //empty
        int m = strlen(p), n = strlen(s);
        bool * OPT = new bool[m+1];
        bool * PRE = new bool[m+1];
        PRE[0] = true;//base case
        for (int j = 1; j <= m; ++j)//base case
            PRE[j] = (j >= 2 && p[j-1] == '*') && PRE[j-2];
          
        OPT[0] = false;//base case, because for i=1...n, res[i][0]=false;
        for (int i = 1; i <= n; ++i){
            for (int j = 1; j <= m; ++j){
             
                OPT[j] = ((p[j-1] == s[i-1] || p[j-1] == '.') && PRE[j-1]) ||
                            ((p[j-1] == '*' && (p[j-2] == s[i-1] || p[j-2] == '.')) && PRE[j]) ||
                            (j-2 >= 0 && p[j-1] == '*' && OPT[j-2]);
            }
            for (int j = 0; j <= m; ++j)
                PRE[j] = OPT[j];
        }
        return PRE[m];
    }


// a recursive version

    bool matchFirst(const char *s, const char *p){
        return (*p == *s || (*p == '.' && *s != '\0'));
    }

    bool isMatch(const char *s, const char *p) {
        if (*p == '\0') return *s == '\0';  //empty
   
        if (*(p + 1) != '*') {//without *
            return matchFirst(s,p)&&isMatch(s + 1, p + 1);
        } else { //next: with a *
            if(isMatch(s, p + 2)) return true;    //try the length of 0
            while ( matchFirst(s,p) )       //try all possible lengths
                if (isMatch(++s, p + 2))return true;
        }

        return false;
    } 

No comments:

Post a Comment