Google Interview Question
Software Engineers, United States
We can pre-process the file, building a word => positions_in_file hash map. Time complexity of this step: O(N), where N is the number of words in the file and we assume a constant limit on the length of any word.
At query time (the code below), we combine the positions of the needed words, sort them, and find the smallest window over those positions. Worst-case time complexity: O(N log N).
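The pre-processing step itself is not shown in the solution below. A minimal sketch of building that map, assuming the file has already been tokenized into a vector of words (the BuildIndex helper name is only for illustration), could look like this:
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;
// Hypothetical helper: builds the word => positions_in_file map from a token list.
unordered_map<string, vector<int>> BuildIndex(vector<string> const &words)
{
    unordered_map<string, vector<int>> word_positions;
    for (int i = 0; i < (int)words.size(); ++i) {
        word_positions[words[i]].push_back(i);  // record every position where the word occurs
    }
    return word_positions;
}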
#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
using namespace std;
pair<int, int> MinWindow(unordered_map<string, vector<int>> const &word_positions,
                         vector<string> const &words_to_find)
{
    unordered_set<string> unique_words_to_find;
    for (auto const &word : words_to_find) {
        unique_words_to_find.insert(word);
    }
    // Gather the positions of every query word and remember which word sits at each position.
    vector<int> positions;
    unordered_map<int, string> word_at_position;
    for (auto const &word : unique_words_to_find) {
        auto it = word_positions.find(word);
        if (it == word_positions.end()) {
            return pair<int, int>(-1, -1);  // a query word never occurs in the file
        }
        vector<int> const &positions_of_word = it->second;
        positions.insert(positions.end(), positions_of_word.begin(), positions_of_word.end());
        for (int pos : positions_of_word) {
            word_at_position[pos] = word;
        }
    }
    if (positions.size() < words_to_find.size()) {
        return pair<int, int>(-1, -1);
    }
    sort(positions.begin(), positions.end());
    // pattern_hash: how many times each word is required; hash: occurrences in the current window.
    unordered_map<string, int> pattern_hash, hash;
    for (string const &word : words_to_find) {
        ++pattern_hash[word];
    }
    pair<int, int> min_window = pair<int, int>(-1, -1);
    int start = 0;
    size_t words_matched_count = 0;
    for (size_t i = 0; i < positions.size(); ++i) {
        int pos = positions[i];
        string const &word = word_at_position[pos];
        ++hash[word];
        if (hash[word] <= pattern_hash[word]) {
            ++words_matched_count;
        }
        if (words_matched_count == words_to_find.size()) {
            // Shrink the window from the left while it still contains all required words.
            while (true) {
                string const &word_at_start = word_at_position[positions[start]];
                if (hash[word_at_start] > pattern_hash[word_at_start]) {
                    ++start;
                    --hash[word_at_start];
                } else {
                    break;
                }
            }
            int window_size = positions[i] - positions[start] + 1;
            if (min_window.first == -1 ||
                window_size < min_window.second - min_window.first + 1)
            {
                min_window = pair<int, int>(positions[start], positions[i]);
            }
        }
    }
    return min_window;
}
int main()
{
    unordered_map<string, vector<int>> word_positions;
    word_positions["cat"] = {1, 8, 45};
    word_positions["rat"] = {5, 27};
    word_positions["bat"] = {64, 128};
    pair<int, int> window = MinWindow(word_positions, {"cat", "rat", "bat"});
    cout << window.first << "->" << window.second << "\n";
    return 0;
}
Here is a working C++11 solution with some tests. I didn't cover any special cases apart from the query not being found at all. I assume that all words in the query are different.
The solution is based on the sliding-window approach and is related to the Rabin-Karp algorithm, with the difference that the window varies in size.
I first reduce the original text by removing all words that do not appear in the query. This step is not necessary, but it reduces the runtime (we can assume the text has far more unique words than the query) and makes the code somewhat nicer, at the cost of storing two additional arrays.
After that I go through all possible window sizes, and for each size I initially create a hash map containing the number of occurrences of each word. To recalculate the hash map for the next window I then only have to remove the "oldest" element and add a new one, which is the idea borrowed from Rabin-Karp and brings the biggest speedup. As the hash map contains only elements which occur in the current window, it is sufficient to compare its size with the size of the query to determine whether the current window contains all words from the query.
Let's say that:
L - total number of words in the original text
Q - number of words in the query
R - number of words in the original text after removing all the words that do not appear in the query
W - length of the longest word in the original text (for calculating hashes)
V - length of the longest word in the query (for calculating hashes)
Time complexity:
Preparing for shortening: O(Q*V)
Shortening the text: O(L*W)
Calculating: O(R^2 * V)
Space complexity:
Dictionary: O(Q*V)
Shortened text: O(R*V) (not mandatory)
Hash: O(Q*V)
#include <iostream>
#include <string>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <utility>
#include <limits>
std::pair<int, int> smallest_window(const std::vector<std::string>& original, const std::vector<std::string>& query) {
    std::unordered_set<std::string> query_check; // could've also been a trie
    for(const std::string& word : query) query_check.insert(word);
    // reduce the long text by removing all words which do not exist in the query
    std::vector<std::string> reduced; // kept words, in order
    std::vector<int> red_orig;        // index of each kept word in the original text
    for(std::size_t i = 0; i < original.size(); ++i) {
        if(query_check.find(original[i]) != query_check.end()) {
            reduced.push_back(original[i]);
            red_orig.push_back(static_cast<int>(i));
        }
    }
    std::pair<int, int> res = {-1, -1};
    int min_window = std::numeric_limits<int>::max();
    for(std::size_t window_size = query.size(); window_size <= reduced.size(); ++window_size) {
        std::unordered_map<std::string, int> content;
        // fill up the initial content
        for(std::size_t i = 0; i < window_size; ++i) {
            ++content[reduced[i]];
        }
        // now slide the window over the reduced text and record every match
        std::size_t window_start = 0;
        while(window_start + window_size <= reduced.size()) {
            if(content.size() == query.size()) { // found a window!
                int span = red_orig[window_start + window_size - 1] - red_orig[window_start];
                if(span < min_window) {
                    res = {red_orig[window_start], red_orig[window_start + window_size - 1]};
                    min_window = span;
                }
            }
            // move the window one step to the right (only if there is a next word to pull in)
            if(window_start + window_size < reduced.size()) {
                --content[reduced[window_start]]; // removing the word furthest to the left
                if(0 == content[reduced[window_start]]) content.erase(reduced[window_start]);
                ++content[reduced[window_start + window_size]]; // adding the new word on the right
            }
            ++window_start;
        }
    }
    return res;
}
int main() {
    {
        std::vector<std::string> original = {"aa", "ab", "ac", "ad", "ae", "ba", "bb", "bc", "bd", "be", "ca", "cb", "cc", "cd", "ce", "da", "db", "dc", "dd", "de"};
        std::vector<std::string> query = {"db", "ae", "bc"};
        std::pair<int, int> res = smallest_window(original, query);
        std::cout << "Expected: 4 (\"ae\") : 16 (\"db\")" << std::endl;
        std::cout << "Result  : " << res.first << " (\"" << original[res.first] << "\") : " << res.second << " (\"" << original[res.second] << "\")" << std::endl;
    }
    {
        std::vector<std::string> original = {"aa", "ab", "ac", "ad", "aa"};
        std::vector<std::string> query = {"aa", "ad"};
        std::pair<int, int> res = smallest_window(original, query);
        std::cout << "Expected: 3 (\"ad\") : 4 (\"aa\")" << std::endl;
        std::cout << "Result  : " << res.first << " (\"" << original[res.first] << "\") : " << res.second << " (\"" << original[res.second] << "\")" << std::endl;
    }
    {
        std::vector<std::string> original = {"aa", "ab", "ac", "ad", "aa"};
        std::vector<std::string> query = {"ac"};
        std::pair<int, int> res = smallest_window(original, query);
        std::cout << "Expected: 2 (\"ac\") : 2 (\"ac\")" << std::endl;
        std::cout << "Result  : " << res.first << " (\"" << original[res.first] << "\") : " << res.second << " (\"" << original[res.second] << "\")" << std::endl;
    }
    {
        std::vector<std::string> original = {"aa", "ab", "ac", "ad", "aa"};
        std::vector<std::string> query = {"aa", "af"};
        std::pair<int, int> res = smallest_window(original, query);
        std::cout << "Expected: no pair found!" << std::endl;
        std::cout << "Result  : ";
        if(-1 == res.first) std::cout << "no pair found!" << std::endl;
        else std::cout << "pair found" << std::endl;
    }
    return 0;
}
1) For string matching:
a) Create a trie consisting of the query keywords. A match is found when a word in the text follows a complete trie path (a minimal trie sketch follows the pseudocode below).
2) Read the file line by line. Tokenize each line, then pass each token into the trie for matching.
class range {
    int start;
    int end;
};

class current_smallest_window {
    int sz;
    list<range> rg; // potentially have multiple smallest ranges
};

class smallest_window {
    ifstream ifile;
    trie trie_f;
    set<string> query_words;
    int start_index;
    unordered_set<string> match_set;
    current_smallest_window csw;

    smallest_window(const char *file_path, const char *query) :
        ifile(file_path), start_index(-1) {
        // read query words into the vector
    }

    void build_trie() {
        // iterate over the query words and create the trie
    }

    void process_file_for_range() {
        char line[MAX_LINE_SIZE];
        int count = 0;
        while(1) {
            ifile.getline(line, MAX_LINE_SIZE);
            if(!ifile.good()) {
                if (!ifile.eof()) {
                    throw exception();
                }
                break; // reached end of file
            }
            count++;
            vector<string> tokens;
            get_line_tokens(line, tokens);
            if (!tokens.empty()) {
                // pass these to the trie for matching
                set<string> current_match_set = get_match_tokens_from_trie(tokens);
                if(current_match_set.empty()) {
                    continue;
                }
                if(current_match_set.size() == query_words.size()) {
                    // all found: insert into current_smallest_window
                    continue;
                }
                // iterate over the current_match_set and check
                // whether it contains everything already in the match_set:
                // if so, update the start_index and set match_set = current_match_set,
                // else (the current match may have new matches but not all the existing ones)
                // keep the same start_index and update the match_set with the new matches
                // from the current match set
            }
        }
    }
};
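The trie type referenced above is left undefined in this sketch. A minimal word trie supporting insert and exact-word lookup, written here only as an illustration of what the sketch assumes, might look like this:
#include <string>
#include <unordered_map>
#include <memory>

// Hypothetical minimal trie for exact word matching, as assumed by the sketch above.
class trie {
    struct node {
        bool is_word = false;
        std::unordered_map<char, std::unique_ptr<node>> children;
    };
    node root;

public:
    // Insert one query keyword into the trie.
    void insert(const std::string &word) {
        node *cur = &root;
        for (char c : word) {
            std::unique_ptr<node> &child = cur->children[c];
            if (!child) child.reset(new node());
            cur = child.get();
        }
        cur->is_word = true; // a complete trie path ends here
    }

    // A word matches only if it follows a complete trie path.
    bool contains(const std::string &word) const {
        const node *cur = &root;
        for (char c : word) {
            auto it = cur->children.find(c);
            if (it == cur->children.end()) return false;
            cur = it->second.get();
        }
        return cur->is_word;
    }
};
Under this assumption, build_trie() would simply call insert for every query word, and get_match_tokens_from_trie() would call contains on each token of the current line.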
- Dhruva.Bharadwaj May 21, 2017