Check if two DOM Trees have th

Google Interview Question for Software Engineers

0

of 0 votes

6
Answers
Check if two DOM Trees have the same text.
e.g. <html>hello</html>, <html>hello</html> should be the same text

DOMNode class definition (string tag, string text, bool isText, vector<DOMNode*> children)
- wtcupup2017 September 28, 2017 in United States | Report Duplicate | Flag | PURGE
Google Software Engineer Algorithm

Email me when people comment.

An error occurred in subscribing you.

Country: United States
Interview Type: In-Person

Email me when people comment.

An error occurred in subscribing you.

Comment hidden because of low score. Click to expand.

of 1 vote

We can extract the text by traversing leaves from left to right (similar to inorder traversal).
This can be improved by extracting text piece by piece, and comparing the growing text strings. As soon as we find a mismatch we stop. The code below implements the idea.

#include <iostream>
#include <string>
#include <stack>
#include <vector>

using namespace std;

class Node {
	public:
		Node(string const &tag, string const &text)
		{
			if (!tag.empty()) {
				is_text_ = false;
				tag_ = tag;
			} else {
				is_text_ = true;
				text_ = text;
			}
		}
		void AddChild(Node *n)
		{
			if (n) {
				children_.push_back(n);
			}
		}

		string tag_, text_;
		bool is_text_;
		vector<Node const *> children_;
};

class TextExtractor {
	public:
		TextExtractor(Node const *n)
		{
			st_.push(n);
		}
		string NextText()
		{
			while (!st_.empty()) {
				Node const *n = st_.top();
				st_.pop();
				if (n) {
					if (n->is_text_) {
						return n->text_;
					} else {
						for (int i = n->children_.size() - 1; i >= 0; --i) {
							st_.push(n->children_[i]);
						}
					}
				}
			}
			return "";
		}
		bool Done() const
		{
			return st_.empty();
		}

	private:
		stack<Node const *> st_;
};

bool SameText(Node const *n1, Node const *n2)
{
	TextExtractor e1(n1), e2(n2);
	string s1, s2;
	while (!e1.Done() ||
		   !e2.Done())
	{
		if (!e1.Done() &&
			(s1.size() < s2.size() || e2.Done()))
		{
			s1 += e1.NextText();
		}
		if (!e2.Done() &&
			(s2.size() <= s1.size() || e1.Done()))
		{
			s2 += e2.NextText();
		}
		for (int i = 0; i < s1.size() && i < s2.size(); ++i) {
			if (s1[i] != s2[i]) {
				return false;
			}
		}
		if (s1.size() < s2.size()) {
			s2 = s2.substr(s1.size());
			s1.clear();
			
		} else {
			s1 = s1.substr(s2.size());
			s2.clear();
		}
	}
	return s1 == s2;
}

int main()
{
	//<html><p>hello</p></html>
	Node n1("html", "");
	Node n2("p", "");
	Node n3("", "hello");
	n1.AddChild(&n2);
	n2.AddChild(&n3);

	//<html><p><b>h</b>ello</p></html>
	Node n10("html", "");
	Node n11("p", "");
	Node n12("b", "");
	Node n13("", "h");
	Node n14("", "ello");
	n10.AddChild(&n11);
	n11.AddChild(&n12);
	n11.AddChild(&n14);
	n12.AddChild(&n13);

	cout << SameText(&n1, &n10) << "\n";
	return 0;
}

- Alex September 28, 2017 | Flag Reply

Comment hidden because of low score. Click to expand.

of 1 vote

import java.util.ArrayList;
import java.util.List;

/*
Check if two DOM Trees have the same text.
e.g. <html><p>hello</p></html>, <html><p><b>h</b>ello</p></html> should be the same text

DOMNode class definition (string tag, string text, bool isText, vector<DOMNode*> children)
 */
public class TextInDomTree {
    static class DOMNode {
        String tag;
        String text;
        boolean isText;
        private List<DOMNode> children;

        void addChild(DOMNode n) {
            if (n == null) return;
            if (children == null) children = new ArrayList<>();
            children.add(n);
        }

        DOMNode(String tag, String text ) {
            this.tag = tag;
            this.text = text;
            this.isText = text != null && text.length() >= 1;
        }
    }

    public static void main(String[] args) {
        //<html><p>hello</p></html>
        {
            DOMNode html = new DOMNode("html", "");
            DOMNode p = new DOMNode("p", "");
            p.addChild(new DOMNode("text", "hello"));
            html.addChild(p);

            StringBuilder sb = new StringBuilder();
            getText(html, sb);
            System.out.println(sb);
        }

        //<html><p><b>h</b>ello</p></html>
        {
            DOMNode html = new DOMNode("html", "");
            DOMNode p = new DOMNode("p", "");
            DOMNode b = new DOMNode("b", "");
            DOMNode h =new DOMNode("text", "h");
            b.addChild(h);
            p.addChild(b);
            DOMNode ello =new DOMNode("text", "ello");
            p.addChild(ello);
            html.addChild(p);

            StringBuilder sb = new StringBuilder();
            getText(html, sb);
            System.out.println(sb);
        }
    }

    /*
    Traverse DOM tree and extract text
     */
    private static void getText(DOMNode node, StringBuilder sb) {

        if (node == null) return;
        if (node.isText) {
            sb.append(node.text);
            return;
        }
        for (DOMNode n : node.children)
            getText(n, sb);
    }
}

- Makarand September 29, 2017 | Flag Reply

Comment hidden because of low score. Click to expand.

of 0 vote

cool question. it really depends how exact the comparison must be and what this is used for (e.g. fraud detection). An alternative to Alex's exact match is:
1) extract all words in any order, transform words into word-ids using a growing dictionary. 2) create a vector for each page, where the word-id is the index and the value is the amount of occurrences of this word.
3) apply a similarity measure between the vectors (like pearson similiarity etc.) and then, if you get a very high similarity, you have a very high chance the pages are equal.

This method is harder to fake, if e.g. somebody has a high interest in bypassing the "exact match" method because same text might have meant "looks the same for the user" and there, the order of leaves definitely is not relevant since the creator can apply any mean tricks to avoid being recognized as duplicate.

If you want to have some tolerance in spelling as well, you might transform the words into some phonetic versions (where similar sounding words get the same phonetic "value" thus are the same words). ...

- Chris September 28, 2017 | Flag Reply

Comment hidden because of low score. Click to expand.

of 0 vote

Solution Idea 1: Let's call the two trees n and m. Iterate over all of the nodes in tree n using a pre-order traversal and using a queue store the sequence of characters
from n (call this string qn). Perform a pre-order traversal of m, everytime you encounter a node with a text string, compare the characters of the text string with characters in qn.
Dequeue characters from qn as they match characters in text strings of m. If there's a character mismatch, or qn becomes empty before finishing traversal of text nodes in m, or
qn has nodes left but we've finished visiting all text nodes of m return false. Otherwise return true.
Time complexity: O(nm) where n is the number of nodes in the tree and m is the length of the longest text string. Space complexity: O(nm) from the queue.

Solution Idea 2: Do an iterative post order traversal with two stacks (s1 for nodes in n  and s2 for nodes in m). Also keep two index variables i (for indices in text strings from n)
and j (for indices in text strings from m). When the top entry of s1 and top entry of s2 are both
nodes with text strings, do a string comparison of these strings(text1 for s1's string and text2 for s2's string). In the string comparison, if text1.charAt(i) != text2.charAt(j)
return false. If i == text1.length && j < text2.length, pop off the top entry in s1 set i = 0 and continue post order traversal of both trees (do the same if j == text2.length but i < text1.length).
At the end of the post order traversal of both trees, if both s1 and s2 are empty return true. Otherwise, return false. This solution has the same worst case
time complexity and space complexity idea 1 but is more optimal in some cases. Consider cases where the string formed by n and m differ at the begining (ie.their starting character). We 
could avoid traversing n in its entirety to find this difference saving both time and space).

Code for Idea 2:
Class Node{
	String tag;
	String text;
	boolean isText;
	List<Node> child;
}

public boolean isSame(Node n, Node m){

	Stack<Node> s1 = new Stack<Node>//n nodes
	Stack<Node> s2 = new Stack<Node>//m nodes
	s1.push(n);
	s2.push(m);
	int i = 0;
	int j = 0;
	while(!s1.isEmpty() && !s2.isEmpty()){
		
		Node top1 = s1.peek();
		Node top2 = s2.peek();
		if(top1.isText && top2.isText){
			while(i < top1.text.length() && j < top2.text.length()){
				if(top1.text.charAt(i) != top2.text.charAt(j)){
					return false;
				}
				i++;
				j++;
			}
			if(i == top1.text.length()){
				s1.pop();
				i = 0;
			}
			if(j == top2.text.length()){
				s2.pop();
				j = 0;
			}
		}else if (!top1.isText && top2.isText){
			fillStack(s1,s1.pop());
		}else if(top1.isText && !top2.isText){
			fillStack(s2,s2.pop());
		}else{
			fillStack(s1,s1.pop());
			fillStack(s2,s2.pop());
		}
	
	}
	return s1.isEmpty() && s2.isEmpty();
}

private void fillStack(Stack<Node> st, Node curr){
	for(int i = curr.child.size() - 1; i >= 0; i--){
		st.push(curr.child.get(i));
	}
}

- divm01986 September 29, 2017 | Flag Reply

Comment hidden because of low score. Click to expand.

of 0 vote

I search both trees simultaneously in a BFS manner, to find the next text node in each tree, and update the hashes on the min common length of these new texts, so the hashed are always on strings with the same total length.

import hashlib
import Queue

class DOMNode:
    def __init__( self, tag, text, isText, children ):
        self.tag = tag
        self.text = text
        self.isText = isText
        self.children =  children

def addToQueue( q, children ):
    for child in children:
        q.put( child )


def getNextTextBFS( q, text, lengths, index ):
    while not q.empty():
        t = q.get()
        addToQueue( q, t.children )
        if not t.isText:
            continue
        text[index] += t.text
        lengths[index] += len(text[index])
        break

def compare( q1, q2, hash1, hash2 ):
    text = [ '', '' ]
    lengths = [ 0, 0 ]
    while not q1.empty() or not q2.empty() :
        getNextTextBFS( q1, text, lengths, 0 )
        getNextTextBFS( q2, text, lengths, 1 )

        if ( text[0] == '' or text[1] == '' ) \
            and lengths[0] != lengths[1] :
            return False

        if lengths[0] == lengths[1]:
            hash1.update( text[0] )
            hash2.update( text[1] )
            text = [ '', '' ]
            lengths = [ 0, 0 ]

        elif lengths[0] > lengths[1]:
            hash1.update( text[0][0:lengths[1]])
            text[0] = text[0][lengths[1]:]
            lengths[0] -= lengths[1]
            hash2.update( text[1] )
            text[1] = ''
            lengths[1] = 0


        else:
            hash1.update( text[0] )
            text[0] = ''
            lengths[0] = 0
            hash2.update( text[1][0:lengths[0]] )
            text[1] = text[1][lengths[0]:]
            lengths[1] -= lengths[0]

        if (hash1.digest() != hash2.digest()):
            return False


    return True


def compareTreesText( tree1, tree2 ):
    hash1 = hashlib.md5()
    hash2 = hashlib.md5()
    q1 = Queue.Queue()
    q2 = Queue.Queue()
    q1.put( tree1 )
    q2.put( tree2 )
    return compare( q1, q2, hash1, hash2 )


if __name__ == "__main__":
    p1 = DOMNode( 'p', 'hello', True, [] )
    html1 = DOMNode( 'html', '', False, [ p1 ] )
    p2 = DOMNode( 'p', 'ello', True, [] )
    html2 = DOMNode( 'html', 'h', True, [ p2 ] )

    print compareTreesText( html1, html2 )

    p1 = DOMNode('p', 'hello, ', True, [])
    p2 = DOMNode('p', 'Would you Like Some tea', True, [])
    html1 = DOMNode('html', '', False, [p1, p2])
    p3 = DOMNode( 'p', 'Like Some tea', True, [])
    title1 = DOMNode( 'title', 'Would you ', True, [ p3 ] )
    html2 = DOMNode( 'html', 'hello, ', True, [ title1 ] )

    print compareTreesText( html1, html2 )

- TGoofnik October 07, 2017 | Flag Reply

Comment hidden because of low score. Click to expand.

of 0 vote

I have a question here: is “text” only the text or the whole inner HTML?
Because if it’s just the text (aka. “ello” in the second tree sample), then we don’t know whether the node’s or children’s text comes first. For instance, the two trees below are different, but an algorithm that compares them may think they’re the same.

<html>

h
ello

</html>

<html>

ello
h

</html>

Even worse, we may not be able to detect the following trees have the same text:

<html>

h
ello

</html>

<html>

h
ello

</html>

If DOMNode’s text attribute contains the HTML, then we can try to parse it and figure out if we start with the text or with children (but I don’t it does...).
Does it make sense or am I missing something here?

- FernandoVieiraDaSilva October 08, 2017 | Flag Reply

CareerCup

Google Interview Question for Software Engineers

Books

Videos

Resume Review

Mock Interviews