Google Interview Question
Software Engineer / DevelopersCountry: United States
Interview Type: In-Person
Just add an additional condition -> if the string is empty and pattern has only has only '*'s.
Should * be handled by:
isMatching(i,j) = isMatching(i+1,j) or isMatching(i+1,j+1) or isMatching(i,j+1)
So it covers the case string is empty and pattern has only "*".
You can solve the problem using the dynamic programming technique.
Construct the matrix match, where match[i][j] is true iff the first i characters of a match the first j characters of b.
The recurrence relation is the following:
match[i][j] =
(match[i-1][j-1] && character_matches(a[i - 1], b[j - 1])) ||
/* eg: (abc, a?c) => (abcd, a?cd) */
(match[i-1][j] && b[j - 1] == '*') ||
/* eg: (ab, a*) => (abc, a*) */
(match[i][j-1] && b[j - 1] == '*')
/* eg: (abc, ab) => (abc, ab*) */
And the basic case is: match[0][0] = true
Time complexity: O(A*B),
Space complexity: O(A*B), can be reduced to O(A+B)
where A,B = length of A, respectively B
Here is the code:
#include <iostream>
using namespace std;
bool isCharMatch(char a, char b) {
return (b == '?') || (b == '*') || (a == b);
}
// match[i][j] = (match[i-1][j-1] && matches(a[i - 1], b[j - 1])) ||
// (match[i-1][j] && b[j - 1] == '*') ||
// (match[i][j-1] && b[j - 1] == '*')
bool isMatch(const string& a, const string& b) {
bool match[a.length() + 1][b.length() + 1];
for (int i = 0; i <= a.length(); ++i)
for (int j = 0; j <= b.length(); ++j) {
match[i][j] = (i == 0) && (j == 0);
if (i > 0 && j > 0)
match[i][j] |= match[i-1][j-1] && isCharMatch(a[i-1], b[j-1]);
if (i > 0 && j > 0)
match[i][j] |= match[i-1][j] && b[j-1] == '*';
if (j > 0)
match[i][j] |= match[i][j-1] && b[j-1] == '*';
}
return match[a.length()][b.length()];
}
void test(const string& a, const string& b) {
cout << "match(" << a << ", " << b << ") = " << isMatch(a, b) << endl;
}
int main() {
test("abab", "*b*");
test("abab", "a**b");
test("abab", "a**b");
test("", "");
test("ab", "");
test("ab", "*");
test("", "**");
test("", "*?");
return 0;
}
Modification of Knuth-Morris-Pratt String matching algorithm.
Time Complexity = O(m+n) where m is the string length and n is the pattern length
public class StringMatchingWithRegexPattern {
/**
* The prefix function π for a pattern encapsulates knowledge about how the pattern matches against shifts of itself.
*/
public static int[] preProcessing(String pattern) {
int index = 0, shift = -1;
int[] prefix = new int[pattern.length() + 1];
prefix[0] = -1;
while (index < pattern.length()) {
if(pattern.charAt(index) == '?' || pattern.charAt(index) == '*'){
if(index == pattern.length() - 1){ // handles the last 'abc*' or 'c?' cases
prefix[index] = ++shift;
break;
}
index++;
}
/**
* String: b a c b a b a b a a b c b a b
* Pattern: a b a b a c a (mismatch between a and c)
* Pattern: a b a b a c Then shift 2 character right
*/
while (shift >= 0 && pattern.charAt(index) != pattern.charAt(shift)) { // if there is mismatch consider next widest border
shift = prefix[shift];
}
index++;
shift++;
prefix[index] = shift;
}
return prefix;
}
public static boolean matching(String text, String pattern) {
// Generate prefix for pattern
int[] prefix = preProcessing(pattern);
int index = 0, charMatched = 0;
while (index < text.length()) {
while(pattern.charAt(charMatched) == '*' || pattern.charAt(charMatched) == '?'){ // while loop for handles **b / ?*b cases
if(pattern.charAt(charMatched)== '*'){
index = text.length() - (pattern.length() - charMatched);
}
index++; // increment for both cases * and ?
charMatched++;
if (charMatched == pattern.length()) {// if their is "*"
return true;
}
}
while (index < text.length() && charMatched >= 0 && text.charAt(index) != pattern.charAt(charMatched)) {
charMatched = prefix[charMatched];
}
index++;
charMatched++;
if (charMatched == pattern.length()) {
return true;
}
}
if(text.length() == 0 && pattern.length() == 1 && pattern.charAt(0) == '*' || pattern.charAt(charMatched) == '?'){ // str:'' ptrn:'*'
return true;
}
return false;
}
public static void main(String[] args) {
System.out.println(matching("ababab","ab*b"));
System.out.println(matching("abab","abab"));
System.out.println(matching("ababab","a**b"));
System.out.println(matching("","*"));
System.out.println(matching("aaaaaab","*?*b"));
}
}
Java Implementation of string matcher using dynamic programming...
public class StringMatcher {
static class Pairs{
int mIndex, pIndex;
Pairs(int mIndex, int pIndex){
this.mIndex = mIndex;
this.pIndex = pIndex;
}
public boolean equals(Object obj) {
if (!(obj instanceof Pairs)) {
return false;
} else {
Pairs that = (Pairs)obj;
return this.mIndex == that.mIndex && this.pIndex == that.pIndex;
}
}
public int hashCode() {
return (mIndex + pIndex)* 31;
}
}
Map<Pairs, Boolean> matchMemorize = null;
String matcher = null;
String pattern = null;
public void initializer(String matcher, String pattern){
this.matchMemorize = new HashMap<Pairs, Boolean>();
this.matcher = matcher;
this.pattern = pattern;
}
public boolean isStringMatches(int mIndex, int pIndex){
if(mIndex == matcher.length() && pIndex == pattern.length()){
return true;
}
else if(mIndex == matcher.length() || pIndex == pattern.length()){
return false;
}
else{
Pairs pairs = new Pairs(mIndex, pIndex);
if(matchMemorize.containsKey(pairs)){
return matchMemorize.get(pairs);
}
else {
boolean result = false;
if(pattern.charAt(pIndex) != '*' && pattern.charAt(pIndex) != '?'){
if(matcher.charAt(mIndex) == pattern.charAt(pIndex)){
result = isStringMatches(mIndex+1, pIndex+1);
}
}
else if(pattern.charAt(pIndex) == '*'){
result = isStringMatches(mIndex+1, pIndex) || isStringMatches(mIndex, pIndex+1) || isStringMatches(mIndex+1, pIndex+1);
}
else { // for ? case
result = isStringMatches(mIndex+1, pIndex+1);
}
Pairs newPair = new Pairs(mIndex, pIndex);
matchMemorize.put(newPair, result);
return result;
}
}
}
public static void main(String[] args) {
StringMatcherUsingDP matcher = new StringMatcherUsingDP();
matcher.initializer("ababab","ab*b");
System.out.println(matcher.isStringMatches(0, 0));
matcher.initializer("abab","abab");
System.out.println(matcher.isStringMatches(0, 0));
matcher.initializer("ababab","a**b");
System.out.println(matcher.isStringMatches(0, 0));
matcher.initializer("","*");
System.out.println(matcher.isStringMatches(0, 0));
matcher.initializer("aaaaaab","*?*b");
System.out.println(matcher.isStringMatches(0, 0));
}
}
In python we can write in simple
in pattern * means is repeated zero or more times (allowing a pattern to repeat zero times means it does not need to appear at all to match).
and pattern ? means the pattern appears zero or one time.
import re
def isMatching(a,b):
regexe=re.compile(b)
if regexe.search(a):
return true
else:
return false
looks like it is a case of implementing a turing machine for language processing. You have states and events where events would be the character seen.
You are right in the sense that one needs to build an automaton for processing this. However, a regular language can be processed using an NFA and one needs to build an NFA to solve this problem. Turing machine can be used to process 'recursively enumerable' languages which are at the top of the hierarchy as defined by Noam Chomsky.
You can also think of the first string being stored in a TST structure and use the second string to search whether a string with that pattern is found in the TST. The search for '?' is trivial for TST. The '*' needs a little bit logic where all characters are skipped over until after that character after '*'. So for example, isMatching("ababab","ab*b") . See 'a', then 'b', then see '*' and character after '*' is 'b', so skip over 'a' and see 'b' but then there is still "ab" remaining but the 2nd string already finish. So this is not a solution, back track one step and continue to skip over 'b', 'a', then see another 'b'. This would be the end of string 1 and string 2. Then this would return true.
I suppose that no interviewer is expecting a truly efficient regular expression matcher. One improvement to your approach is to mark the states we already processed. The search state can be described by the position i in the search string and position j in the pattern string from which we are trying to find the matching.
This leads to a O(|S| * |P|) algorithm.
const int MAXS = 1000, MAXP = 100;
string s, p; // text, pattern
int slen, plen;
char visited[MAXS][MAXP];
char go(int i, int j) {
if (visited[i][j]) // already processed this pair (i,j)
return visited[i][j];
if (i == slen) {
// we reached the end of the text, remaining chars of the pattern must be '*'
for ( ; j < plen; j++)
if (p[j] != '*')
return visited[i][j] = -1;
return visited[i][j] = 1;
}
if (j == plen) // end of the pattern but there are still chars in S
return visited[i][j] = -1;
if (p[j] != '*') {
if (p[j] != '?' && p[j] != s[i])
return visited[i][j] = -1;
return visited[i][j] = go(i+1, j+1);
}
// '*' can consume any number of characters
if ( (go(k, j+1) == 1) || (go(k+1, j) == 1) )
return visited[i][j] = 1;
return visited[i][j] = -1;
}
int main() {
getline(cin, s);
getline(cin, p);
slen = s.size(), plen = p.size();
cout << (go(0,0) == 1) << endl;
return 0;
}
In below c++ code , for checking it you need to copy code and just run, without any change,
In input you put simple string as first string and pattern for second string, then program give output true or false
code:
#include<iostream>
#include<string.h>
using namespace std;
bool check(string s1,string s2)
{
int l1 =s1.length(),l2 = s2.length();
if(l1==0 && l2==0)
{
return true;
}
else if(l1==1 && l2==0)
{
return false;
}
else if(l1==0 && l2==1)
{
if((int)s2[0] > 96)
{
return false;
}
else
{
return true;
}
}
else if(l1==0 && l2>1)
{
if((int)s2[0] > 96)
{
return false;
}
else
{
return check(s1,s2.substr(1,l2));
}
}
if(l1==1 && l2==1)
{
if((int)s2[0] > 96)
{
if(s1[0]==s2[0])
{
return true;
}
else
{
return false;
}
}
else
{
return true;
}
}
int t = (int)s2[0];
if(t>96)
{
if(s1[0]==s2[0])
{
return check(s1.substr(1,l1), s2.substr(1,l2));
}
else
{
return false;
}
}
else if(t==42)
{
return (check(s1.substr(1,l1),s2) | check(s1, s2.substr(1,l2)));
}
else if(t==63)
{
return (check(s1,s2.substr(1,l2)) | check(s1.substr(1,l1), s2.substr(1,l2)));
}
return false;
}
int main()
{
//* = 42, ?=63
while(1)
{
string s1,s2;
cout<<"Enter first string(Base): ";
cin>>s1;
cout<<"Enter second string(Pattern): ";
cin>>s2;
int t=check(s1,s2);
if(t==1)
{
cout<<"\nOutput: True\n\n";
}
else
{
cout<<"\nOutput: False\n\n";
}
}
return 0;
}
bool IsOnlyAsterisk(std::wstring Pat)
{
std::wstring::iterator Itr = Pat.end();
for(Itr = Pat.begin();
Itr != Pat.end() && *Itr == L'*';++Itr);
return (Itr == Pat.end());
}
bool Match(std::wstring Str,std::wstring Pat)
{
if(Str.empty())
{
return (Pat.empty() || IsOnlyAsterisk(Pat));
}
if(Pat.empty())
{
return (false);
}
if(Pat[0] == L'?')
{
return Match(Str.substr(1),Pat.substr(1));
}
if(Pat[0] == L'*')
{
return (Match(Str.substr(1),Pat) || Match(Str.substr(1),Pat.substr(1)) || Match(Str,Pat.substr(1)));
}
return ((Pat[0] == Str[0]) &&
Match(Str.substr(1),Pat.substr(1)));
}
int main()
{
bool Res = Match(L"abcab",L"a*?*c*?*b*");
cout << Res << endl;
}
The best way to solve such a problem is to first build a DFA out of the expression. Then after eliminating obvious redundant states, you get the most efficient program that can match an input against the state.
Of course for small expressions you can directly write a program by simply going back n forth in the input to check for a match.
There are a number of backtracking quadratic solutions proposed, but it seems to me there is a linear solution with no backtracking.
I'll assume the pattern string starts and ends with an asterisk (we can easily reduce the general case to this, by first verifying that the start and end of the alphanumeric string disjointly match the portions of the pattern string before and after any asterisks, respectively, so as to only have to deal with the remaining portions of the pattern and alphanumeric strings).
Note that, conceptually, we might as well treat any repeated asterisks in the pattern string as just a single asterisk. Thus, the pattern string amounts to a sequence of specific texts (possibly with ?s in them, but this is no big deal to match against), separated by *s. That is, specific texts which must show up in a specific order, but no further constraints.
We can simply match this greedily, in linear time: run through the alphanumeric string looking for the first instance of the first text; once you've found that, keep running through the alphanumeric string from where you left off till you find the first instance of the second text; once you've found that, keep running through from where you left off till you find the first instance of the third text, etc. If you eventually find instances of every text, hooray, there's a match; if you hit the end of the alphanumeric string before doing so, boo (or hooray), there's no match.
Regular Expression matchers work in linear time. Since this problem uses simple RE, there is also a linear time solution. However, these algorithms are complex. I don't think any interviewer is expecting a candidate to be able to code this in an interview.
I don't think that simple approach works. Consider this example
s: "abcabdcd"
p: "ab*d"
you'll greedily match the 'd' in the pattern with the first 'd' in the string, which is incorrect. even if you make a quick fix for this case, i think it still won't work in more complex cases where '*' shall skip several characters (not necessarily the first match)
I assume this is in response to my answer? As I explained, first you have to reduce to the case where the pattern starts and ends with asterisks; THEN you can be greedy (in the appropriate sense). The parts of the pattern before and after asterisks have to be dealt with specially, but easily.
So for your example, first we pull the initial ab and the final d out of s and p, and the problem reduces to greedily matching "cabdc" against "*", which works out straightforwardly.
Put another way, instead of thinking of reducing to the case where the pattern string starts and stops with asterisks, we can think of the pattern and alphanumeric strings as both implicitly beginning with a special "START" character and ending with a special "STOP" character.
Then the algorithm is just:
If the pattern string is empty, return true or false based on whether the alphanumeric string is empty.
If the pattern string is non-empty, extract from the pattern string the component prior to any asterisks, and chomp through the alphanumeric string till finding the first match for that component (returning false if you hit the end without finding such a match). Then continue matching the remainder of the pattern against the remainder of the alphanumeric string.
Finally, if the pattern string starts with asterisks, throw them out, and match the rest of the pattern string against the alphanumeric string.
(I'd write this into my answer, but apparently answers can't be edited?)
Yes, the reply was for you, my bad.
The point is that the matching between the blocks in the pattern without '*' and the given text is not easily done in O(N), but it's possible of course.
Example: "aaabaaaabaabaaaaaabacacacaacc" and "*aaaaab*aac*". To match the subblocks "aaaaab" and "aac" you'll need a sort of needle in a haystack algorithm like KMP or Aho-Corasick to keep it O(N) and not O(N*P), thus contradicting the "simply" and "greedily" in "We can simply match this greedily,"
Yeah, that's fair. But, even if one were not willing or able to do the full KMP thing right away, one could at least write up the code in a modularized way such that the only part to worry about improving later is the findFirstInstanceOf(string to find, string to find it in) function, and with no backtracking to deal with anywhere else.
The findFirstInstanceOf thing could, of course, be made into KMP or any other such efficient thing, and then the whole thing would be O(N). Even if findFirstInstanceOf were done "naively", though, this methodology would avoid wasting a lot of time in unnecessarily backtracking to look for second, third, etc. instances of the subblocks before declaring that there is no match. It's _that_ unnecessary backtracking which I think is good to observe the uselessness of.
I think recursion if easy for understanding, but DP is butter, since I'm not good at DP, so just
the recursion way, kind of time consuming.
public static boolean compare(String s1,String s2){
if(s1==null || s2==null) return false;
int count=0;
int l1=s1.length();
int l2=s2.length();
for(int i=0;i<l2;i++){
char c = s2.charAt(i);
if(c=='*'){
int j=i+1;
if(j==l2) return true;
while(s2.charAt(j)=='*'||s2.charAt(j)=='?'){
j++;
if(j==l2)
return true;
}
char next=s2.charAt(j);
while(count<l1){
while(s1.charAt(count)!=next){
count++;
if(count==l1) return false;
}
if(count==l1) return false;
boolean flag = compare(s1.substring(count),s2.substring(j));
if(flag) return true;
count++;
}
}
else if(c!='?'){
if(s1.charAt(count)!=s2.charAt(i))
return false;
count++;
}
else
count++;
}
return true;
}
You can check codes in other posts. It's not really DP but rather mark what you already computed (a bit different because if you already computed a state (i,j) it means it failed).
So it's really about not using strings for recursion but the indices i (text) and j (pattern) currently to start from
bool Match(const char *pat, const char *str) {
std::list<const char *> curr, next;
curr.push_back(pat);
while (*str != '\0') {
while (!curr.empty()) {
const char *i = curr.front();
curr.pop_front();
if (*i == *str || *i == '?') {
next.push_back(i + 1);
} else if (*i == '*') {
next.push_back(i);
curr.push_back(i + strspn(i, "*"));
}
}
if (next.empty()) return false;
curr.swap(next);
++str;
}
for (const char *i : curr) {
if (strspn(i, "*") == strlen(i)) return true;
}
return false;
}
I took some time to make this. Seems to be working fine.
#include<stdio.h>
int isMatching(char *s, char *str)
{
int i=0,j=0,star_pos=0,star_found=0,pat=0;
while(s[i]!='\0' && str[j]!='\0')
{
if(s[i]==str[j])
{i++; j++; pat=1;}
else if(str[j]=='*')
{
star_found=1;
star_pos=j;
j++; pat=0;
}
else if(str[j]=='?') {i++;j++;}
else if (star_found)
{
j=star_pos+1;
if(pat!=1)
i++;
else
pat=0;
}
else return 0;
}
if(str[j]!='\0') return 0;
else return 1;
}
int main()
{
char s[]="eugeeulllg";
char str[]="e*eu*??l?";
int res;
res=isMatching(s,str);
printf("\nResult = %d\n",res);
return 0;
}
Any ideas?
#include <stdio.h>
#include <string.h>
#define FALSE 0
#define TRUE 1
typedef unsigned char BOOL;
typedef struct {
char *input;
char *pattern;
} test_case;
BOOL regex_match(char *s, char *p)
{
if ((!s[0] && !p[0]) ||
(!s[0] && p[0] == '*' && !p[1])) {
return TRUE;
}
if (p[0] == '?' || p[0] == s[0]) {
return regex_match(s+1, p+1);
}
if (p[0] == '*') {
return regex_match(s, p+1) || // ignore the '*' completely
regex_match(s+1, p); // '*' in pattern consumes s[0]
}
return FALSE;
}
int main()
{
int i;
test_case tc[] = {
{"abcd","abdd"},
{"abcd","abcd"},
{"abcd","ab?d"},
{"","?"},
{"ab","?*"},
{"a",""},
{"","a*b"},
{"","****"},
{"","*c"},
{"",""},
{"","*"},
{"acfdfcdcdcd","a*cd*cd"},
{"abdfpdcdcd","ab*cd"},
{"abdfpdcdcd","ab*"},
};
for (i=0; i<sizeof(tc)/sizeof(tc[0]); i++) {
if (TRUE == regex_match(tc[i].input, tc[i].pattern)) {
printf("\"%s\" matches \"%s\"\n", tc[i].input, tc[i].pattern);
} else {
printf("\"%s\" doesn't match \"%s\"\n", tc[i].input, tc[i].pattern);
}
}
return 0;
}
You can solve this recursively.
Let isMatching(i,j) denote the indices of the string and the pattern respectively (returns true or false). Basically, we'll start from the indices (0,0) of the string and the pattern and if we are able to reach (lengthOfS,lengthOfP) it means that we've successfully matched the string and the pattern(returns 1 in this case and 0 otherwise).
If the current character of the pattern is not a special character - '?' or '*' then we have the following cases:
1) It is equal to the character of the string
2) It is not equal to the character of the string
In case of (1) we can simply go ahead and check with the next character, i.e we must match isMatching(i+1,j+1). In case (2) we must return zero.
If the current character of the pattern is a '*', then it means we can match the current character of the string or we can move ahead with the next character of the pattern,i.e
isMatching(i,j) = isMatching(i+1,j) or isMatching(i+1,j+1)
If the current character of the pattern is a '?', then irrespective of the character of the string, we can move to the next character on both the pattern & the string.
isMatching(i,j) = isMatching(i+1,j+1)
But implementing the above will lead to an exponential complexity owing to the repeated computation of the substructures. So we can memoize this recursion and hence an O(N*M) solution(space and time) is possible.
Python implementation with memoization
- Just Another Indian Wannabe Coder July 31, 2013