Facebook Interview Question Software Engineer / Developers
0of 0 votesGattaca
You have a DNA string that you wish to analyze. Of particular interest is which intervals of the string represent individual genes. You have a number of "gene predictions", each of which assigns a score to an interval within the DNA string, and you want to find the subset of predictions such that the total score is maximized while avoiding overlaps. A gene prediction is a triple of the form (start, stop, score). start is the zero-based index of the first character in the DNA string contained in the gene. stop is the index of the last character contained in the gene. score is the score for the gene.
Input Specification
Your program will be passed the name of an input file on the command line. The contents of that file are as follows.
The first line of the input contains only n, the length of the DNA string you will be given.
The next ceiling(n / 80) lines each contain string of length 80 (or n % 80 for the last line) containing only the characters 'A', 'C', 'G', and 'T'. Concatenate these lines to get the entire DNA strand.
The next line contains only g, the number of gene predictions you will be given.
The next g lines each contain a whitespace-delimited triple of integers of the form
<start> <stop> <score> representing a single gene prediction. No gene predictions will exceed the bounds of the DNA string or be malformed (start is non-negative and no more than stop, stop never exceeds n - 1).
Example Input:
100
GAACTATCGCCCGTGCGCATCGCCCGTCCGACCGGCCGTAAGTCTATCTCCCGAGCGGGCGCCCGATCTCAAGTGCACCT
CACGGCCTCACGACCGTGAG
8
43 70 27
3 18 24
65 99 45
20 39 26
45 74 26
10 28 20
78 97 23
0 9 22
Output Specification
Print to standard out the score of the best possible subset of the gene predictions you are given such that no single index in the DNA string is contained in more than one gene prediction, followed by a newline. The total score is simply the sum of the scores of the gene predictions included in your final result.
When constructing your output, you may only consider genes exactly as they are described in the input. If you find the contents of a gene replicated elsewhere in the DNA string, you are not allowed to treat the second copy as a viable gene. Your solution must be fast and efficient to be considered correct by the robot.
Example Output:
100
IMO its a good variation of longest increasing sub sequence.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct
{
int start;
int end;
int score;
}genePred_t;
void readInputFromFile(char* fileName, int* pNumOfGenePred, genePred_t** ppGenePred)
{
FILE* fp = 0;
char* pFilePtr=0, *pGene=0, *token=0;
int len=0, curLen=0,bytes=0,index=0, geneLen=0, numOfGenePred=0;
genePred_t* pGenePred = 0;
/*open the file*/
fp = fopen(fileName, "r");
/*get length of gene*/
len = getline(&pFilePtr, &bytes, fp);
pFilePtr[len-1]=0;
geneLen = atoi(pFilePtr);
free(pFilePtr);
pFilePtr=0;
len=0;
/*get the gene*/
pGene = (char*)malloc(geneLen+1);
memset(pGene,0x00,geneLen+1);
while(len<geneLen)
{
curLen = getline(&pFilePtr, &bytes, fp);
pFilePtr[curLen-1]=0;
strcat(pGene,pFilePtr);
free(pFilePtr);
pFilePtr=0;
len += curLen-1;
}
/*get the number of gene predection*/
len = getline(&pFilePtr, &bytes, fp);
pFilePtr[len-1]=0;
numOfGenePred = atoi(pFilePtr);
free(pFilePtr);
pFilePtr=0;
/*get Gene predection*/
pGenePred = (genePred_t*)malloc(sizeof(genePred_t)*numOfGenePred);
len=numOfGenePred;
while(len)
{
curLen = getline(&pFilePtr, &bytes, fp);
pFilePtr[curLen-1]=0;
token = strtok(pFilePtr," ");
pGenePred[index].start = atoi(token);
token = strtok(0," ");
pGenePred[index].end = atoi(token);
token = strtok(0," ");
pGenePred[index].score = atoi(token);
free(pFilePtr);
pFilePtr=0;
len--;
index++;
}
*pNumOfGenePred = numOfGenePred;
*ppGenePred = pGenePred;
fclose(fp);
}
void addScore(int maxIndex, int* pPrev, genePred_t* pGenePred, int* Score)
{
if(maxIndex==0)
{
return;
}
addScore(pPrev[maxIndex],pPrev,pGenePred,Score);
printf("[%d]", pGenePred[maxIndex-1].score);
*Score = *Score + pGenePred[maxIndex-1].score;
}
int addAllScore(genePred_t* pGenePred, int* pIncSeq, int* pPrev, int numOfGenePred)
{
int maxIndex=0,i=0,score=0,max=0;
for(i=0;i<=numOfGenePred;i++)
{
if(max<pIncSeq[i])
{
max=pIncSeq[i];
maxIndex = i;
}
}
addScore(maxIndex, pPrev, pGenePred, &score);
return score;
}
int compare (void* arg1, void* arg2)
{
genePred_t* pGenePred1 = (genePred_t*)arg1;
genePred_t* pGenePred2 = (genePred_t*)arg2;
return (pGenePred1->start > pGenePred2->start)?(1):(-1);
}
void computeMaxScore(genePred_t* pGenePred,int numOfGenePred)
{
int i=0,j=0,max=0,score=0, maxScore=0, curScore=0;
int* pIncSeq=0,*pPrev=0,*pScore=0;
qsort(pGenePred, numOfGenePred, sizeof(genePred_t),compare);
pIncSeq = (int*)malloc(sizeof(int)*(numOfGenePred+1));
pPrev = (int*)malloc(sizeof(int)*(numOfGenePred+1));
pScore = (int*)malloc(sizeof(int)*(numOfGenePred+1));
memset(pIncSeq,0x00,sizeof(int)*(numOfGenePred+1));
memset(pPrev,0x00,sizeof(int)*(numOfGenePred+1));
memset(pScore,0x00,sizeof(int)*(numOfGenePred+1));
for(i=0;i<numOfGenePred;i++)
{
max=0;
curScore=0;
for(j=0;j<=i;j++)
{
if(pGenePred[i].start > pGenePred[j].end)
{
if(max<=pIncSeq[j+1])
{
curScore = pScore[j]+pGenePred[i].score;
if(maxScore < curScore)
{
pScore[j+1]=curScore;
maxScore=curScore;
max=pIncSeq[j+1];
pPrev[i+1] = j+1;
}
}
}
else
{
if(pScore[j+1]<pGenePred[i].score)
pScore[j+1]=pGenePred[i].score;
}
}
max=max+1;
pIncSeq[i+1]=max;
}
score = addAllScore(pGenePred,pIncSeq,pPrev,numOfGenePred);
printf("\n");
printf("Score: %d\n",score);
free(pIncSeq);
free(pPrev);
free(pScore);
}
int main(int argc,char* argv[])
{
int numOfGenePred=0;
genePred_t* pGenePred=0;
if(argc < 2)
{
printf("Invalid Input !!\n");
return 0;
}
readInputFromFile(argv[1],&numOfGenePred, &pGenePred);
computeMaxScore(pGenePred,numOfGenePred);
return 0;
}
I didn't run the code other than the input given in this post. Please let me know if there is a bug or mistake in logic. Code expects valid input text file with newline in the end.
I'll explain the detailed logic latter.
lengthy question lengthy code !!
I understood something but can u plz explain the logic completely?
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <cmath>
#include <stdlib.h>
typedef struct {
long start, finish, weight;
} Interval;
Interval *I;
long *M;
long *P;
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
int compare(const void * left, const void * right) {
return (((Interval *)left)->finish - ((Interval *) right)->finish);
}
int IntervalSearch(int start, int high){
if (high == -1) return -1;
int low = 0;
int best = -1;
int mid;
int finish;
while (low <= high) {
mid = (low + high) /2 ;
finish = I[mid].finish;
if (finish >= start) {
high = mid-1;
} else {
best = mid;
low = mid + 1;
}
}
return best;
}
long Compute_Opt(long j) {
if (j == 0)
return 0;
if (M[j] != -1l) {
return M[j];
}
M[j] = MAX(I[j].weight + Compute_Opt(P[j]), Compute_Opt(j - 1));
return M[j];
}
int main( int argc, char **argv) {
if ( argc <= 1 ) {
return 0;
}
std::ifstream dnaInput(argv[1]);
if ( dnaInput.peek() == EOF ) {
return 0;
}
int lenDnaStr;
dnaInput >> lenDnaStr ;
for(int index = 0; index <= std::ceil(((float)lenDnaStr)/80.0); index++) {
dnaInput.ignore( lenDnaStr + 1, '\n');
}
int numOfIv = 0;
dnaInput >> numOfIv;
if ( numOfIv == 0 ) {
return 0;
}
numOfIv = numOfIv + 1;
I = (Interval *) malloc (sizeof(Interval) * numOfIv);
M = (long *)malloc(sizeof(long) * numOfIv);
P = (long *)malloc(sizeof(long) * numOfIv);
Interval t;
t.start = 0;
t.finish = 0;
t.weight = 0;
I[0] = t;
for ( int index = 1; index < numOfIv; index++) {
int start = 0, finish = 0, weight = 0;
dnaInput >> start >> finish >> weight;
Interval t;
t.start = start;
t.finish = finish;
t.weight = weight;
I[index] = t;
}
dnaInput.close();
qsort(I, numOfIv, sizeof(Interval), compare);
int best;
for (int index = 1; index < numOfIv; index++){
M[index] = -1l;
best = IntervalSearch(I[index].start,index - 1);
if (best != -1) {
P[index] = best;
} else {
P[index] = 0;
}
}
long res = Compute_Opt(numOfIv - 1);
free ( P );
free ( I );
free ( M );
std::cout << res << std::endl;
}
#include <iostream>
#include <algorithm>
#include <vector>
#include <cstring>
using namespace std;
struct Endpoint {
int loc;
int score;
int best_score;
Endpoint * other;
Endpoint (int _loc, int _score) : loc(_loc), score(_score), best_score(0) {}
};
bool comp_endpt_ptr(Endpoint * e1, Endpoint * e2) {
return e1->loc < e2->loc;
}
int N, G;
vector<Endpoint *> endpts;
int main() {
// ignore actual DNA sequence... we only need this if we want to
// print results
string s;
cin >> N;
for (int i = 0; i < (N - 1)/80 + 1; i++)
cin >> s;
cin >> G;
for (int i = 0; i < G; i++) {
int start, end, score;
cin >> start >> end >> score;
Endpoint * left = new Endpoint(start, score);
Endpoint * right = new Endpoint(end + 1, score);
left->other = right;
right->other = left;
endpts.push_back(left);
endpts.push_back(right);
}
sort(endpts.begin(), endpts.end(), comp_endpt_ptr);
for (int i = 1; i < endpts.size(); i++) {
Endpoint * pt = endpts[i];
if (pt->other->loc > pt->loc)
pt->best_score = endpts[i - 1]->best_score;
else
pt->best_score = max(endpts[i - 1]->best_score,
pt->score + pt->other->best_score);
// cout << pt->loc << ": " << pt->best_score << endl;
}
cout << endpts.back()->best_score << endl;
}
<pre lang="" line="1" title="CodeMonkey8253" class="run-this">code in C#
class DNA : IComparable<DNA>
{
public int Start { get; set; }
public int End { get; set; }
public int Score { get; set; }
public DNA( int s, int e, int ds )
{
this.Start = s;
this.End = e;
this.Score = ds;
}
public int CompareTo( DNA d )
{
return this.End.CompareTo( d.End );
}
}
public static void FindMaxScore()
{
List<DNA> dnas = new List<DNA>();
// read input
dnas.Sort();
int N = 100;
int[] scores = new int[N];
int[] seleceted = new int[N];
for (int i = 0; i < dnas.Count; i++)
{
DNA d = dnas[i];
if (d.Score + scores[d.Start] >= scores[d.End - 1])
{
scores[d.End] = d.Score + scores[d.Start];
seleceted[d.End] = i+1;
}
else
{
scores[d.End] = scores[d.End - 1];
seleceted[d.End] = seleceted[d.End - 1];
}
int iEnd = N;
if (i < dnas.Count - 1)
{
iEnd = dnas[i + 1].End;
}
for (int j = d.End + 1; j < iEnd; j++)
{
scores[j] = scores[j - 1];
seleceted[j] = seleceted[j - 1];
}
}
//output max score
Console.WriteLine( scores[N - 1] );
//output selected DNA strings
int idx = seleceted[N - 1];
while (idx > 0)
{
Console.WriteLine( idx - 1 );
idx = seleceted[dnas[idx - 1].Start];
}
}</pre><pre title="CodeMonkey8253" input="yes">
</pre>

I think it is a weighted interval scheduling. It can be solved by Dynamic Programming i O(N) time. Given a set of DNA substrings --> intervals (structured in term of <bgn,end,score>). I[j] is not overlapped with I[k] if end_j < bgn_k. The trick is to sort the interval in earliest finish. We shall have an array of interval I of size N. Given OPT(j) be the optimal path defined on the first jth interval. OPT(j) has two options; one that include I[j] and one doesn't.
- Saimok on July 04, 2011 Edit | Flag ReplyFor the path that include I[j], the problem is down to OPT(prev(j)) where prev(j) is the closest interval to the left of I[j] which does not overlap I[j]. For the path that exclude I[j], the problem is down to OPT(j-1). So the pseudo code is: 1. Sort(I) by the earliest finish (last DNA position) 2. Given a function prev(j) begin at I[j] and move to the left of I[j] until I[k].end < I[j].bgn this is a previous non-overlap of I[j]. 3. The recursion will look like this; int OPT(int j){ if(j < 0) return 0; //base case else { return Max(OPT(j-1), OPT(pre(j))); } } For DP, create an OPT table of size N and initialize OPT[0] to I[0].score. For j=1 to N-1 simply call OPT(j). This is an O(N) complexity.