Google Interview Question
Software Engineers, United States
We can pre-process the file, building a word => positions_in_file hash map. Time complexity of this step: O(N), where N is the number of words in the file and we assume a constant limit on the length of any word.
At query time (the code below), we combine the positions of the needed words, sort them, and find the smallest window over those positions. Worst-case time complexity: O(N log N).
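The pre-processing step itself is not shown in the solution below. A minimal sketch of building that map, assuming the file has already been tokenized into a vector of words (the BuildIndex helper name is only for illustration), could look like this:
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;
// Hypothetical helper: builds the word => positions_in_file map from a token list.
unordered_map<string, vector<int>> BuildIndex(vector<string> const &words)
{
    unordered_map<string, vector<int>> word_positions;
    for (int i = 0; i < (int)words.size(); ++i) {
        word_positions[words[i]].push_back(i);  // record every position where the word occurs
    }
    return word_positions;
}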
#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
using namespace std;
pair<int, int> MinWindow(unordered_map<string, vector<int>> const &word_positions,
                         vector<string> const &words_to_find)
{
    unordered_set<string> unique_words_to_find;
    for (auto const &word : words_to_find) {
        unique_words_to_find.insert(word);
    }
    // Gather the positions of every query word and remember which word sits at each position.
    vector<int> positions;
    unordered_map<int, string> word_at_position;
    for (auto const &word : unique_words_to_find) {
        auto it = word_positions.find(word);
        if (it == word_positions.end()) {
            return pair<int, int>(-1, -1);  // a query word never occurs in the file
        }
        vector<int> const &positions_of_word = it->second;
        positions.insert(positions.end(), positions_of_word.begin(), positions_of_word.end());
        for (int pos : positions_of_word) {
            word_at_position[pos] = word;
        }
    }
    if (positions.size() < words_to_find.size()) {
        return pair<int, int>(-1, -1);
    }
    sort(positions.begin(), positions.end());
    // pattern_hash: how many times each word is required; hash: occurrences in the current window.
    unordered_map<string, int> pattern_hash, hash;
    for (string const &word : words_to_find) {
        ++pattern_hash[word];
    }
    pair<int, int> min_window = pair<int, int>(-1, -1);
    int start = 0;
    size_t words_matched_count = 0;
    for (size_t i = 0; i < positions.size(); ++i) {
        int pos = positions[i];
        string const &word = word_at_position[pos];
        ++hash[word];
        if (hash[word] <= pattern_hash[word]) {
            ++words_matched_count;
        }
        if (words_matched_count == words_to_find.size()) {
            // Shrink the window from the left while it still contains all required words.
            while (true) {
                string const &word_at_start = word_at_position[positions[start]];
                if (hash[word_at_start] > pattern_hash[word_at_start]) {
                    ++start;
                    --hash[word_at_start];
                } else {
                    break;
                }
            }
            int window_size = positions[i] - positions[start] + 1;
            if (min_window.first == -1 ||
                window_size < min_window.second - min_window.first + 1)
            {
                min_window = pair<int, int>(positions[start], positions[i]);
            }
        }
    }
    return min_window;
}
int main()
{
    unordered_map<string, vector<int>> word_positions;
    word_positions["cat"] = {1, 8, 45};
    word_positions["rat"] = {5, 27};
    word_positions["bat"] = {64, 128};
    pair<int, int> window = MinWindow(word_positions, {"cat", "rat", "bat"});
    cout << window.first << "->" << window.second << "\n";
    return 0;
}
Here is a working C++11 solution with some tests. I didn't cover any special cases apart from the query not being found at all. I assume that all words in the query are different.
The solution is based on the sliding-window approach and is related to the Rabin-Karp algorithm, with the difference that the window varies in size.
I first reduce the original text by removing all words that do not appear in the query. This step is not necessary, but it reduces the runtime (we can assume the text has far more unique words than the query) and makes the code somewhat nicer, at the cost of storing two additional arrays.
After that I go through all possible window sizes, and for each size I initially create a hash map containing the number of occurrences of each word. To recalculate the hash map for the next window I then only have to remove the "oldest" element and add a new one, which is the idea borrowed from Rabin-Karp and brings the biggest speedup. As the hash map contains only elements which occur in the current window, it is sufficient to compare its size with the size of the query to determine whether the current window contains all words from the query.
Let's say that:
L - total number of words in the original text
Q - number of words in the query
R - number of words in the original text after removing all the words that do not appear in the query
W - length of the longest word in the original text (for calculating hashes)
V - length of the longest word in the query (for calculating hashes)
Time complexity:
Preparing for shortening: O(Q*V)
Shortening the text: O(L*W)
Calculating: O(R^2 * V)
Space complexity:
Dictionary: O(Q*V)
Shortened text: O(R*V) (not mandatory)
Hash: O(Q*V)
#include <iostream>
#include <string>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <utility>
#include <limits>
std::pair<int, int> smallest_window(const std::vector<std::string>& original, const std::vector<std::string>& query) {
    std::unordered_set<std::string> query_check; // could've also been a trie
    for(const std::string& word : query) query_check.insert(word);
    // reduce the long text by removing all words which do not exist in the query
    std::vector<std::string> reduced; // kept words, in order
    std::vector<int> red_orig;        // index of each kept word in the original text
    for(std::size_t i = 0; i < original.size(); ++i) {
        if(query_check.find(original[i]) != query_check.end()) {
            reduced.push_back(original[i]);
            red_orig.push_back(static_cast<int>(i));
        }
    }
    std::pair<int, int> res = {-1, -1};
    int min_window = std::numeric_limits<int>::max();
    for(std::size_t window_size = query.size(); window_size <= reduced.size(); ++window_size) {
        std::unordered_map<std::string, int> content;
        // fill up the initial content
        for(std::size_t i = 0; i < window_size; ++i) {
            ++content[reduced[i]];
        }
        // now slide the window over the reduced text and record every match
        std::size_t window_start = 0;
        while(window_start + window_size <= reduced.size()) {
            if(content.size() == query.size()) { // found a window!
                int span = red_orig[window_start + window_size - 1] - red_orig[window_start];
                if(span < min_window) {
                    res = {red_orig[window_start], red_orig[window_start + window_size - 1]};
                    min_window = span;
                }
            }
            // move the window one step to the right (only if there is a next word to pull in)
            if(window_start + window_size < reduced.size()) {
                --content[reduced[window_start]]; // removing the word furthest to the left
                if(0 == content[reduced[window_start]]) content.erase(reduced[window_start]);
                ++content[reduced[window_start + window_size]]; // adding the new word on the right
            }
            ++window_start;
        }
    }
    return res;
}
int main() {
    {
        std::vector<std::string> original = {"aa", "ab", "ac", "ad", "ae", "ba", "bb", "bc", "bd", "be", "ca", "cb", "cc", "cd", "ce", "da", "db", "dc", "dd", "de"};
        std::vector<std::string> query = {"db", "ae", "bc"};
        std::pair<int, int> res = smallest_window(original, query);
        std::cout << "Expected: 4 (\"ae\") : 16 (\"db\")" << std::endl;
        std::cout << "Result  : " << res.first << " (\"" << original[res.first] << "\") : " << res.second << " (\"" << original[res.second] << "\")" << std::endl;
    }
    {
        std::vector<std::string> original = {"aa", "ab", "ac", "ad", "aa"};
        std::vector<std::string> query = {"aa", "ad"};
        std::pair<int, int> res = smallest_window(original, query);
        std::cout << "Expected: 3 (\"ad\") : 4 (\"aa\")" << std::endl;
        std::cout << "Result  : " << res.first << " (\"" << original[res.first] << "\") : " << res.second << " (\"" << original[res.second] << "\")" << std::endl;
    }
    {
        std::vector<std::string> original = {"aa", "ab", "ac", "ad", "aa"};
        std::vector<std::string> query = {"ac"};
        std::pair<int, int> res = smallest_window(original, query);
        std::cout << "Expected: 2 (\"ac\") : 2 (\"ac\")" << std::endl;
        std::cout << "Result  : " << res.first << " (\"" << original[res.first] << "\") : " << res.second << " (\"" << original[res.second] << "\")" << std::endl;
    }
    {
        std::vector<std::string> original = {"aa", "ab", "ac", "ad", "aa"};
        std::vector<std::string> query = {"aa", "af"};
        std::pair<int, int> res = smallest_window(original, query);
        std::cout << "Expected: no pair found!" << std::endl;
        std::cout << "Result  : ";
        if(-1 == res.first) std::cout << "no pair found!" << std::endl;
        else std::cout << "pair found" << std::endl;
    }
    return 0;
}
1) For string matching:
a) Create a trie consisting of the query keywords. A match is found when a word in the text follows a complete trie path (a minimal trie sketch follows the pseudocode below).
2) Read the file line by line. Tokenize each line, then pass each token into the trie for matching.
class range {
    int start;
    int end;
};

class current_smallest_window {
    int sz;
    list<range> rg; // potentially have multiple smallest ranges
};

class smallest_window {
    ifstream ifile;
    trie trie_f;
    set<string> query_words;
    int start_index;
    unordered_set<string> match_set;
    current_smallest_window csw;

    smallest_window(const char *file_path, const char *query) :
        ifile(file_path), start_index(-1) {
        // read query words into the vector
    }

    void build_trie() {
        // iterate over the query words and create the trie
    }

    void process_file_for_range() {
        char line[MAX_LINE_SIZE];
        int count = 0;
        while(1) {
            ifile.getline(line, MAX_LINE_SIZE);
            if(!ifile.good()) {
                if (!ifile.eof()) {
                    throw exception();
                }
                break; // reached end of file
            }
            count++;
            vector<string> tokens;
            get_line_tokens(line, tokens);
            if (!tokens.empty()) {
                // pass these to the trie for matching
                set<string> current_match_set = get_match_tokens_from_trie(tokens);
                if(current_match_set.empty()) {
                    continue;
                }
                if(current_match_set.size() == query_words.size()) {
                    // all found: insert into current_smallest_window
                    continue;
                }
                // iterate over the current_match_set and check
                // whether it contains everything already in the match_set:
                // if so, update the start_index and set match_set = current_match_set,
                // else (the current match may have new matches but not all the existing ones)
                // keep the same start_index and update the match_set with the new matches
                // from the current match set
            }
        }
    }
};
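The trie type referenced above is left undefined in this sketch. A minimal word trie supporting insert and exact-word lookup, written here only as an illustration of what the sketch assumes, might look like this:
#include <string>
#include <unordered_map>
#include <memory>

// Hypothetical minimal trie for exact word matching, as assumed by the sketch above.
class trie {
    struct node {
        bool is_word = false;
        std::unordered_map<char, std::unique_ptr<node>> children;
    };
    node root;

public:
    // Insert one query keyword into the trie.
    void insert(const std::string &word) {
        node *cur = &root;
        for (char c : word) {
            std::unique_ptr<node> &child = cur->children[c];
            if (!child) child.reset(new node());
            cur = child.get();
        }
        cur->is_word = true; // a complete trie path ends here
    }

    // A word matches only if it follows a complete trie path.
    bool contains(const std::string &word) const {
        const node *cur = &root;
        for (char c : word) {
            auto it = cur->children.find(c);
            if (it == cur->children.end()) return false;
            cur = it->second.get();
        }
        return cur->is_word;
    }
};
Under this assumption, build_trie() would simply call insert for every query word, and get_match_tokens_from_trie() would call contains on each token of the current line.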
- Dhruva.Bharadwaj May 21, 2017