Microsoft Interview Question for Software Engineer / Developers


Country: United States
Interview Type: Phone Interview





In Java, with a TreeMap<String, Integer> this can be solved in O(n log n).

Or is there a trick to it?

- X July 13, 2015 | Flag Reply

Your idea is good. Could you also explain how you would find the kth-ranked word from the TreeMap?

- vishgupta92 July 13, 2015 | Flag

A TreeMap sorts itself by its keys, which in this case are the words, not the frequencies. If you are going to sort by frequency you can use whatever Map is available; it will take O(n log n) either way.
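Either way, the O(n log n) step is just sorting the (word, count) pairs by count; a quick sketch (function name is my own):

```python
def words_by_frequency(counts):
    """Sort (word, freq) pairs by descending frequency: O(n log n)."""
    return sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
```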

- hieu.pham July 13, 2015 | Flag

This link solves the problem in linear time:
capacode.com/?p=598

- anon July 13, 2015 | Flag

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.TreeMap;

public class CountWordsInWordFile {

    private static String filePath = "c:\\abcd.txt";

    public static void main(String[] args) throws IOException {

        // word -> frequency
        Map<String, Integer> wordCount = new HashMap<String, Integer>();
        // frequency -> words with that frequency, kept sorted by frequency
        TreeMap<Integer, List<String>> occurrences = new TreeMap<Integer, List<String>>();

        BufferedReader br = new BufferedReader(new FileReader(filePath));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                for (String token : line.split("\\s+")) {
                    if (token.trim().isEmpty()) {
                        continue;
                    }
                    if (!wordCount.containsKey(token)) {
                        wordCount.put(token, 1);
                        List<String> ones = occurrences.get(1);
                        if (ones == null) {
                            ones = new ArrayList<String>();
                            occurrences.put(1, ones);
                        }
                        ones.add(token);
                    } else {
                        // move the word from its old frequency bucket to the next one
                        int count = wordCount.get(token);
                        occurrences.get(count).remove(token);
                        count++;
                        wordCount.put(token, count);
                        List<String> bucket = occurrences.get(count);
                        if (bucket == null) {
                            bucket = new ArrayList<String>();
                            occurrences.put(count, bucket);
                        }
                        bucket.add(token);
                    }
                }
            }
        } finally {
            br.close();
        }

        // print buckets from highest frequency to lowest
        NavigableSet<Integer> frequencies = occurrences.descendingKeySet();
        for (Integer f : frequencies) {
            if (occurrences.get(f).isEmpty()) {
                continue;
            }
            System.out.println(f + ":" + occurrences.get(f));
        }
    }
}

- unknown July 13, 2015 | Flag Reply

Preprocessing: put the words and their counts (frequencies) into a hash map: O(n).

Solution 1: use randomized quickselect to find the kth most frequent word; this takes O(n) best and average case, and O(n^2) worst case.

Solution 2: use the median-of-medians method, which takes O(n) worst case to find the kth most frequent word.

Postprocessing: once you have the kth most frequent word, run the partition step used in quicksort to get the k most frequent words (in no particular order). Takes O(n).

--> can be solved in O(n) time overall!
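A minimal sketch of Solution 1 (randomized quickselect over the hash map's (word, count) pairs; the helper name and the list-comprehension partition are my own choices):

```python
import random

def kth_most_frequent(counts, k):
    """Return the word with the k-th highest frequency (1-based).

    counts: dict mapping word -> frequency, built in O(n) beforehand.
    Average O(n) via randomized quickselect; O(n^2) worst case.
    """
    items = list(counts.items())

    def select(lo, hi, target):
        # Partition items[lo..hi] around a random pivot frequency,
        # higher frequencies first.
        pivot = items[random.randint(lo, hi)][1]
        left = [it for it in items[lo:hi + 1] if it[1] > pivot]
        mid = [it for it in items[lo:hi + 1] if it[1] == pivot]
        right = [it for it in items[lo:hi + 1] if it[1] < pivot]
        items[lo:hi + 1] = left + mid + right
        if target < lo + len(left):
            return select(lo, lo + len(left) - 1, target)
        if target < lo + len(left) + len(mid):
            return items[target]
        return select(lo + len(left) + len(mid), hi, target)

    return select(0, len(items) - 1, k - 1)[0]
```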

- hieu.pham July 13, 2015 | Flag Reply

A few solutions I have in mind...

1. Use counting sort: O(n+k), where n is the size of the input and k is the maximum key value (a hash of the word can serve as the key). To find the top k words, iterate the result array: O(k). Overall running time: O(n+k); additional space: O(k). The space overhead may be large depending on the maximum key value.

2. A BST of size n. Running time: O(n log n). Its O(n) space may be less than the space allocated in (1).

3. For small input, insertion sort would suffice. If already sorted, O(n) time to find the top k elements; otherwise O(n^2) worst case. This algorithm is ideal when the input is already close to sorted; otherwise the running time is quadratic. Space: O(1).
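One way to read option 1 is counting on the frequencies themselves (my interpretation, not necessarily the poster's): bucket the words by frequency, then scan the buckets from high to low. A sketch:

```python
from collections import Counter

def top_k_by_counting(words, k):
    """Bucket words by frequency, then scan buckets high-to-low: O(n + max_freq)."""
    counts = Counter(words)
    max_freq = max(counts.values(), default=0)
    buckets = [[] for _ in range(max_freq + 1)]   # index = frequency
    for word, freq in counts.items():
        buckets[freq].append(word)
    result = []
    for freq in range(max_freq, 0, -1):           # highest frequency first
        for word in buckets[freq]:
            result.append(word)
            if len(result) == k:
                return result
    return result
```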

- Yev July 13, 2015 | Flag

Can be solved in linear time!
See this link: www.capacode.com/?p=598

- anon July 13, 2015 | Flag Reply

We can use a combination of a heap and a trie.

- ram.prasad.prabu July 14, 2015 | Flag Reply

I would have used MapReduce. It scales nicely for huge loads, at the cost of some overhead (and extra expense) for just one small file. In the mapper you map each word to 1 (its initial frequency), use the combiner to aggregate data per mapper, and reduce. The tricky part is the output: each reducer creates its own file with the data ordered by key, so although the keys are ordered inside each file, concatenating the files will not necessarily

- Anonymous July 31, 2015 | Flag Reply

Hit the "submit" button accidentally... anyway, as I was saying, I would use MapReduce. Nowadays we don't just need a file to be sorted; algorithms need to scale horizontally. Some of you have written nice pieces of code that run in quadratic or linear time, but what happens when your "n" gets huge? MapReduce works well on a single node, in pseudo-distributed mode, and especially in the cloud. The code is easy to write: a simple mapper (each word with an initial frequency of one), a combiner (to optimize the job by sending partial aggregations to the shuffle), and a reducer that finishes the aggregation. The output is the tricky part. Each reducer creates a file and stores its results ordered by key; inside each file there is a total or partial order, but across files the keys are not ordered (e.g. file1: 1000 -> mother, 234 -> father; file2: 3240 -> a, 12 -> the; etc.). To solve this you can either: a) read the first k words from all the files, sort them, and print (lame); b) use a single reducer, resulting in a single ordered partition (a.k.a. file), and just print the top k rows (time consuming and wastes resources); or c) use a partitioner that guarantees a total order of the keys across all the files, then read the first k lines of the first file.
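The mapper/shuffle/reducer pipeline described above can be simulated locally in a few lines (a toy single-process sketch, not real Hadoop code; function names are illustrative):

```python
from collections import defaultdict

def mapper(line):
    # emit (word, 1) for every word, as the comment's mapper does
    return [(w, 1) for w in line.split()]

def reducer(word, ones):
    # finish the aggregation for one key
    return word, sum(ones)

def word_count(lines):
    """Toy single-process simulation of the map / shuffle / reduce stages."""
    shuffled = defaultdict(list)
    for line in lines:                       # map
        for word, one in mapper(line):
            shuffled[word].append(one)       # shuffle: group values by key
    return dict(reducer(w, ones) for w, ones in shuffled.items())
```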

enjoy

- ionel munteanu July 31, 2015 | Flag Reply

I tried this in linear time.
Have a look:

#!/usr/bin/env python3
# You are given a word document,
# you have to print the words with frequencies.
# Now print the kth rank word in terms of frequency.
# Print top k words as well by frequencies.
# Found this question on careercup
#
# If n is the number of words present in the given document,
# then the below solution runs in O(n) time.
# Vikas Prasad
# 4 September 2015

import re

def frequency_counter(file_path, k):
    # for word documents
    # found this piece of code for fetching contents from docx file on SO, User name: benjamin
    import zipfile
    content = ""
    try:
        docx = zipfile.ZipFile(file_path)   # Load Docx into zipfile
    except FileNotFoundError:
        print('File not found.')
        exit()
    unpacked = docx.infolist()  # Unpack zipfile
    for item in unpacked:       # Find the /word/document.xml file in the package and assign it to variable
        if item.orig_filename == 'word/document.xml':
            content = docx.read(item.orig_filename)
            break
    # print(content)

    # docx files contain a lot of xml, thus removing tags.
    # stripping <> tags and changing & to and
    stripped_content = re.compile(b'<.*?>').sub(b' ', content)  # strip tags
    stripped_content = re.compile(b'&amp;').sub(b'and', stripped_content )   # change & to and
    
    '''
    # for txt files
    try:
        fobj = open(file_path)
    except FileNotFoundError:
        print('File not found.')
        exit()
    content = fobj.read()       # note: it's better to read line by line and process incrementally,
    stripped_content = content  # to be memory efficient; doing it this way generalizes with the docx path above.
    # also, for .txt files replace all b (bytes) patterns in the re methods with r (str)
    '''
    # print(stripped_content)

    # Main logic starts here
    # Creating a words: frequency dictionary
    stripped_content = stripped_content.lower() # handling cases
    words_frequency = {}
    max_frequency = 1        # (will have to check this for zero word document, not an issue for now)
    words = re.compile(b'[A-Za-z]+').findall(stripped_content) # list of all the words in stripped_content
    # print(words)

    for w in words:     # create dictionary
        # if n is the number of words, this runs in O(n)
        try:
            words_frequency[w] += 1
            if words_frequency[w] > max_frequency:
                max_frequency = words_frequency[w]
        except KeyError:
            words_frequency[w] = 1
 
    # print words with frequencies
    print('Words with their frequencies:')
    for word in words_frequency:
        # O(n)
        print(word, words_frequency[word])
    print()

    # now replacing frequencies with ranks
    for word in words_frequency:
        # O(n)
        words_frequency[word] = max_frequency - words_frequency[word]

    # temp array to hold actual positions of words in the rank array
    temp_array = [[0] for _ in range(max_frequency)]
    for word in words_frequency:
        # O(n)
        temp_array[words_frequency[word]][0] = 1        # mark all positions which are a valid rank, leave rest unmarked
        temp_array[words_frequency[word]].append(word)  # append respective words

    for i, x in enumerate(temp_array[1 : ]):    # add up the array left to right to get the position for rank array
        # size of temp_array is max_frequency
        # thus this loop runs max_frequency - 1 times
        # clearly max_frequency <= n
        # thus this runs in <= O(n)
        x[0] += temp_array[i][0]

    # rank array holds actual rank scaled down to 1
    rank_array = [[] for _ in range(temp_array[-1][0] + 1)]
    for word in words_frequency:
        # O(n)
        rank_array[temp_array[words_frequency[word]][0]].extend(temp_array[words_frequency[word]][1:])  # append words at the respective ranks
        temp_array[words_frequency[word]] = temp_array[words_frequency[word]][:1]       # remove words from temp array, so that they do not get reappended for same rank words in words_frequency

    print('Word(s) with rank', k, ': ', rank_array[k])
    print()

        
    print('Top', k, 'words are:')
    for i, words in enumerate(rank_array[1 : k+1]):
        print(i+1, words)
    print()

if __name__ == '__main__':
    # file_path = '/home/vikasprasad/classesPython.txt'
    # file_path = '/home/vikasprasad/cc.txt'
    # file_path = '/home/vikasprasad/CV.docx'
    file_path = input('Enter complete path for word document: ')
    # k = 5
    k = int(input('Enter value of k: '))
    frequency_counter(file_path, k)

- vikasprasad.prasad September 04, 2015 | Flag Reply

1. Use a Dictionary<string, int> to get the frequency of each word.
2. Use a heap of size k to get the top k words or the kth word (with a size-k heap it should be a min-heap on frequency, evicting the smallest, so the k most frequent survive).
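A sketch of the two steps (a min-heap of size k on frequency, so the least frequent of the current top k is evicted first; names are illustrative):

```python
import heapq
from collections import Counter

def top_k_words(text, k):
    """Return the k most frequent words, most frequent first.

    Counting is O(n); the heap never holds more than k entries,
    so the whole thing runs in O(n log k).
    """
    counts = Counter(text.split())        # step 1: word -> frequency
    heap = []                             # min-heap of (freq, word)
    for word, freq in counts.items():     # step 2: size-k min-heap
        heapq.heappush(heap, (freq, word))
        if len(heap) > k:
            heapq.heappop(heap)           # drop the least frequent
    return [w for f, w in sorted(heap, reverse=True)]
```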

- Gabriel February 21, 2016 | Flag Reply

We can create a hash map to store each word and its frequency, then a tree map to store the sorted frequencies with their lists of words, and finally dump the contents into an array; this way we can get the highest and lowest frequencies easily.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;

public class WordFrequency {

    public static void main(String[] args) {

        String sentence = "A word word has been been has A A A been bbb abbab bbb he he he is a good man";
        String[] wordArray = sentence.split(" ");
        HashMap<String, Integer> initialMap = new HashMap<String, Integer>();
        TreeMap<Integer, List<String>> sortedMap = new TreeMap<Integer, List<String>>();
        Object array[][];

        // Create a map to store word and frequency
        for (int i = 0; i < wordArray.length; i++) {
            String word = wordArray[i];
            Integer count = initialMap.get(word);
            if (count == null) {
                initialMap.put(word, 1);
            } else {
                initialMap.put(word, ++count);
            }
        }

        // Create a tree map to store frequency and list of words
        Iterator<String> it = initialMap.keySet().iterator();
        while (it.hasNext()) {
            String string = (String) it.next();
            Integer count = initialMap.get(string);
            List<String> list = sortedMap.get(count);
            if (list == null) {
                list = new ArrayList<String>();
            }
            list.add(string);
            sortedMap.put(count, list);
        }

        /* Store the tree map content into a 2d array;
         * now based on index you can easily find the word with highest and lowest frequency.
         */
        array = new Object[sortedMap.size()][3];

        Iterator<Integer> it1 = sortedMap.keySet().iterator();
        int i = 0;
        while (it1.hasNext()) {
            Integer integer = (Integer) it1.next();
            List<String> list = sortedMap.get(integer);
            array[i][0] = i;
            array[i][1] = integer;
            array[i][2] = list;
            i++;
        }

        for (int j = 0; j < array.length; j++) {
            for (int j2 = 0; j2 < array[j].length; j2++) {
                System.out.print(array[j][j2] + " ");
            }
            System.out.println("");
        }
    }
}

- ultikhopdi June 22, 2016 | Flag Reply

#include <algorithm>
#include <iterator>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using std::string;

// Returns the words whose frequency is at least `rank`,
// sorted by descending frequency.
std::vector<std::pair<string, int>>
    CountWords( string str, int rank )
{
    std::unordered_map<string, int> WordsWithRank;

    size_t startpos = 0;

    while ( startpos < str.size() )
    {
        // size_t here: storing find_first_of's result in an int
        // makes the npos comparison fragile
        size_t pos = str.find_first_of(". ;:?!", startpos);
        if ( pos == string::npos )
            pos = str.size();   // the last word may not end with a delimiter

        string word = str.substr(startpos, pos - startpos);
        if ( !word.empty() )
        {
            auto pp = WordsWithRank.find(word);
            if ( pp != WordsWithRank.end() )
                pp->second++;
            else
                WordsWithRank.insert(std::make_pair(word, 1));
        }

        // skip over any run of delimiters
        while ( pos < str.size() &&
                str.substr(pos, 1).find_first_of(". ;:?!") != string::npos )
        {
            pos++;
        }
        startpos = pos;
    }

    std::vector<std::pair<string, int>> pairs;
    pairs.reserve(WordsWithRank.size());

    std::copy( WordsWithRank.begin(),
               WordsWithRank.end(),
               std::back_inserter(pairs) );

    std::sort( pairs.begin(),
               pairs.end(),
               [](const std::pair<string, int>& a, const std::pair<string, int>& b)
               {
                   return a.second > b.second;
               } );

    size_t endpos = 0;
    while ( ( endpos < pairs.size() ) && ( pairs[endpos].second >= rank ) )
    {
        endpos++;
    }
    pairs.resize(endpos);
    return pairs;
}

- drolmal April 09, 2017 | Flag Reply

