Microsoft Interview Question
SDE-2sTeam: Office
Country: India
Interview Type: In-Person
With hashing , complexity is O(n) where n - length of DNA sequence.
I don't think that using tries you will achieve complexity better than O(n).
What hash function did you suggest to the interviewer?
You could create suffix tree for O(n) time and check all 10-letter DNA if there is dublicate in suffix tree , but it is also O(n) approach and I don't think that interviewer expect to use suffix tree approach for 45 minutes
Just a comment. I was thinking a little bit on the approach you suggest. I could speculate on what happens.
You are going to create suffix trie, but a trie with keywords which are first 10 letters from each suffix in DNA sequence. You are going to generate iterative, in sense first check if 10 -letter prefix of given suffix is in trie, if so return it as dublicate , if it is not dublicate you will add it to suffix trie ( trie from all 10 prefix of all suffixes of DNA). Complexity is again O(n) and it is dubious if it is better solution than hashing.
Using the suffix tree logic in C++ :
///
#include <iostream>
#include <unordered_map>
using namespace std;
struct Node
{
char ch_;
unordered_map<char, Node*> children_;
int count_; // indicates occurance
Node(char c):ch_(c), count_(0) {}
};
int maxDepth = 0;
int maxCount = 0;
string targetStr;
void insertIntoSuffixTree(Node* root, string& str, string& originalSubStr, int level = 0)
{
if (10 == level && 0 < root->count_)
{
// reached the leaf node
maxDepth = level;
maxCount = root->count_;
targetStr = originalSubStr.substr(0, level);
}
if (str.length() == 0)
return;
if (root->children_.count(str[0]) == 0)
{
// create a new child
Node* child = new Node(str[0]);
root->children_[str[0]] = child;
root = child;
}
else
{
Node* child = root->children_[str[0]];
child->count_++; // more than one visit
root = child;
}
string subStr = str.substr(1);
insertIntoSuffixTree(root, subStr, originalSubStr, level + 1);
}
int main()
{
string str;
getline(cin, str);
Node *root = new Node('$');
for (int i = 0; i < str.length(); i++)
{
string subStr = str.substr(i);
insertIntoSuffixTree(root, subStr, subStr);
}
cout << "Length of SubString [" << maxDepth << "]" << endl;
cout << "Occurance count [" << maxCount + 1 << "]" << endl;
cout << "SubString [" << targetStr << "]" << endl;
}
\\\
Using suffix tree in C++ :
#include <iostream>
#include <unordered_map>
using namespace std;
struct Node
{
char ch_;
unordered_map<char, Node*> children_;
int count_; // indicates occurance
Node(char c):ch_(c), count_(0) {}
};
int maxDepth = 0;
int maxCount = 0;
string targetStr;
void insertIntoSuffixTree(Node* root, string& str, string& originalSubStr, int level = 0)
{
if (10 == level && 0 < root->count_)
{
// reached the leaf node
maxDepth = level;
maxCount = root->count_;
targetStr = originalSubStr.substr(0, level);
}
if (str.length() == 0)
return;
if (root->children_.count(str[0]) == 0)
{
// create a new child
Node* child = new Node(str[0]);
root->children_[str[0]] = child;
root = child;
}
else
{
Node* child = root->children_[str[0]];
child->count_++; // more than one visit
root = child;
}
string subStr = str.substr(1);
insertIntoSuffixTree(root, subStr, originalSubStr, level + 1);
}
int main()
{
string str;
getline(cin, str);
Node *root = new Node('$');
for (int i = 0; i < str.length(); i++)
{
string subStr = str.substr(i);
insertIntoSuffixTree(root, subStr, subStr);
}
cout << "Length of SubString [" << maxDepth << "]" << endl;
cout << "Occurance count [" << maxCount + 1 << "]" << endl;
cout << "SubString [" << targetStr << "]" << endl;
}
If you tweak the problem and replace A, T, G and C by 0, 1, 2, 3 problem is comparatively easy.
Problem reduces to finding repeated 10 digit number. Adding one digit and removing first digit in a number is relatively much easier than adding new char to string and computing hash code each time.
Here is the code. Binary numbers are much faster to manipulate than number in base 10.
Each character is taking 2 bits here:
public Set<String> findRepeated(String s) {
if (s == null || s.length() < 10) {
return Collections.emptySet();
}
Map<Character, Integer> map = new HashMap<>();
map.put('A', 0);
map.put('C', 1);
map.put('G', 2);
map.put('T', 3);
Set<Integer> visited = new HashSet<>();
Set<String> result = new HashSet<>();
int mask = (1 << 20) -1; //111...11 (20 times)
int number = 0;
for (int i = 0; i < s.length(); i++) {
if (i < 9) {
number = (number << 2) + map.get(s.charAt(i));
} else {
number = (number << 2) + map.get(s.charAt(i));
//make sure number is only 20 bits
number = number & mask;
if (visited.contains(number)) {
result.add(s.substring(i - 9, i + 1));
}
visited.add(number);
}
}
return result;
}
void find_dup(string ststr)
{
map <string , int> msstr;
string sstr = ststr.substr(0,10);
int i=1;
while(sstr.size() == 10)
{
msstr[sstr]++;
sstr = ststr.substr(i,10);
i++;
}
map <string , int> :: iterator msit = msstr.begin();
for( ; msit != msstr.end(); msit++)
{
if((*msit).second > 1 )
cout << " String : " << (*msit).first << " count " << (*msit).second << endl;
}
}
package interview;
import java.util.*;
public class countDuplicate {
public void count(String a){
Set<Character> set = new TreeSet<Character>();
char[] c = a.toCharArray();
for(int i=0;i<c.length;i++){
int cnt = 1;
for(int j=1;j<c.length;j++){
if(c[i] == c[j]){
cnt++;
//System.out.println(c[i] + cnt);
}
if(cnt == 10){
set.add(c[i]);
continue;
}
}
}
System.out.println(set);
}
public static void main(String[] args) {
// TODO Auto-generated method stub
String s = "AAAADSSSDSSSSSSSASSAAAADEDAAAAA";
countDuplicate cnt = new countDuplicate();
cnt.count(s);
}
}
package interview;
import java.util.*;
public class countDuplicate {
public void count(String a){
Set<Character> set = new TreeSet<Character>();
char[] c = a.toCharArray();
for(int i=0;i<c.length;i++){
int cnt = 1;
for(int j=1;j<c.length;j++){
if(c[i] == c[j]){
cnt++;
//System.out.println(c[i] + cnt);
}
if(cnt == 10){
set.add(c[i]);
continue;
}
}
}
System.out.println(set);
}
public static void main(String[] args) {
// TODO Auto-generated method stub
String s = "AAAADSSSDSSSSSSSASSAAAADEDAAAAA";
countDuplicate cnt = new countDuplicate();
cnt.count(s);
}
}
Use an integer array of size 4.
Since we know it's just 4 characters, we can map them to 0 1 2 3 positions respectively.
Have 3 pointers. previous, current, next.
If all the 3 are the same, enter a switch case to increment the count. if not reset to 0. I'm assuming that the solution needed is to get the consecutively duplicated character having a length 10 or more.
O(N) time where N is the number of the characters in the dna sequence
O(1) space since an integer array of size 4 is used.
char getDuplicateCharacter() {
String dna = "AAAAAATTTTAGGCCCCTTTTTGGGGGCCAAAAAAAAAAAAGGGCTA"
char[] ch = dna.toCharArray();
int[] chars = new int[4];
if(ch[0] == 'A')
chars[0]++;
else if(ch[0] == 'T')
chars[0]++;
else if(ch[0] == 'G')
chars[2]++;
else//c
chars[3]++;
for(int i=1; i<ch.length-1; i++) {
switch(ch[i]) {
case 'A' :
if(ch[i-1] == 'A' && ch[i+1] == 'A')
chars[0]++;
else
chars[0] = 0;
if(chars[0] == 10)
return 'A';
break;
case 'T':
if(ch[i-1] == 'T' && ch[i+1] == 'T')
chars[1]++;
else
chars[1] = 0;
if(chars[1] == 10)
return 'T';
break;
case 'G':
if(ch[i-1] == 'G' && ch[i+1] == 'G')
chars[2]++;
else
chars[2] = 0;
if(chars[2] == 10)
return 'G';
break;
case 'C':
if(ch[i-1] == 'C' && ch[i+1] == 'C')
chars[2]++;
else
chars[2] = 0;
if(chars[2] == 10)
return 'C';
break;
}
}
}
String dna = "AAAAAATTTTAGGCCCCTTTTTGGGGGCCAAAAAAAAAAAAGGGCTA"
char[] ch = dna.toCharArray();
int[] chars = new int[4];
if(ch[0] == 'A')
chars[0]++;
else if(ch[0] == 'T')
chars[0]++;
else if(ch[0] == 'G')
chars[2]++;
else//c
chars[3]++;
for(int i=1; i<ch.length-1; i++) {
switch(ch[i]) {
case 'A' :
if(ch[i-1] == 'A' && ch[i+1] == 'A')
chars[0]++;
else
chars[0] = 0;
if(chars[0] == 10)
return 'A';
break;
case 'T':
if(ch[i-1] == 'T' && ch[i+1] == 'T')
chars[1]++;
else
chars[1] = 0;
if(chars[1] == 10)
return 'T';
break;
case 'G':
if(ch[i-1] == 'G' && ch[i+1] == 'G')
chars[2]++;
else
chars[2] = 0;
if(chars[2] == 10)
return 'G';
break;
case 'C':
if(ch[i-1] == 'C' && ch[i+1] == 'C')
chars[2]++;
else
chars[2] = 0;
if(chars[2] == 10)
return 'C';
break;
}
}
String dna = "AAAAAATTTTAGGCCCCTTTTTGGGGGCCAAAAAAAAAAAAGGGCTA"
char[] ch = dna.toCharArray();
int[] chars = new int[4];
if(ch[0] == 'A')
chars[0]++;
else if(ch[0] == 'T')
chars[0]++;
else if(ch[0] == 'G')
chars[2]++;
else//c
chars[3]++;
for(int i=1; i<ch.length-1; i++) {
switch(ch[i]) {
case 'A' :
if(ch[i-1] == 'A' && ch[i+1] == 'A')
chars[0]++;
else
chars[0] = 0;
if(chars[0] == 10)
return 'A';
break;
case 'T':
if(ch[i-1] == 'T' && ch[i+1] == 'T')
chars[1]++;
else
chars[1] = 0;
if(chars[1] == 10)
return 'T';
break;
case 'G':
if(ch[i-1] == 'G' && ch[i+1] == 'G')
chars[2]++;
else
chars[2] = 0;
if(chars[2] == 10)
return 'G';
break;
case 'C':
if(ch[i-1] == 'C' && ch[i+1] == 'C')
chars[2]++;
else
chars[2] = 0;
if(chars[2] == 10)
return 'C';
break;
}
}
We could hash all substrings with length 10 and later will find dublicates using hash values
- EPavlova March 13, 2016