//usage: a.out prefix_of_allele_information alignment.bam [-b backbone_id] #include #include #include #include #include #include #include "alignments.hpp" struct _compatible { int weight ; double value ; } ; struct _snpInfo { char type ; // d, s, i int position ; char nucleotide ; int length ; } ; struct _mdComponent { char type ; int length ; int num ; } ; struct _result { int a, b ; double logLikelihood ; } ; std::map snpNameToId ; std::map alleleNameToId ; std::vector alleleIdToName ; std::vector snpInfo ; // When using this, we already convert the snp name into snp id std::vector< std::vector > snpLink ; // What are the allele ids associate with the snp id std::map > positionToSnp ; // map of the genomic coordinate to the snp id std::vector< std::vector > alleleSnpList ; // the list of snp ids associate with this allele std::vector alleleLength ; std::vector< struct _pair > alignmentCoords ; bool CompResult( struct _result a, struct _result b ) { return a.logLikelihood > b.logLikelihood ; } void Split( const char *s, char delimit, std::vector &fields ) { int i ; fields.clear() ; if ( s == NULL ) return ; std::string f ; for ( i = 0 ; s[i] ; ++i ) { if ( s[i] == delimit || s[i] == '\n' ) { fields.push_back( f ) ; f.clear() ; } else f.append( 1, s[i] ) ; } fields.push_back( f ) ; f.clear() ; } int main( int argc, char *argv[] ) { int i, j, k ; FILE *fp ; char buffer[10100] ; std::vector fields ; int binSize = 50 ; const char *backboneName = NULL ; // the compatilibity between the alignment and allele // The likelihood of this read from this allele struct _compatible **compatibility ; int **snpAllele ; // whether a snp showed up in the allele. Alignments alignments ; alignments.Open( argv[2] ) ; for ( i = 3 ; i < argc ; ++i ) { if ( !strcmp( argv[i], "-b" ) ) { backboneName = argv[i + 1] ; alignments.OnlyChrom( backboneName ) ; ++i ; } else { fprintf( stderr, "Unknown argument %s.\n", argv[i] ) ; exit( 1 ) ; } } // Parse the files associate with the snps // Firstly, read in the snp list have the information sprintf( buffer, "%s.snp", argv[1] ) ; fp = fopen( buffer, "r" ) ; k = 0 ; while ( fgets( buffer, sizeof( buffer ), fp ) ) { Split( buffer, '\t', fields ) ; if ( backboneName && strcmp( fields[2].c_str(), backboneName ) ) continue ; snpNameToId[ fields[0] ] = k ; struct _snpInfo info ; info.type = fields[1][0] ; if ( info.type == 'd' ) { info.position = atoi( fields[3].c_str() ) ; info.length = atoi( fields[4].c_str() ) ; } else if ( info.type == 'i' ) { info.position = atoi( fields[3].c_str() ) ; info.length = strlen( fields[4].c_str() ) ; } else { info.position = atoi( fields[3].c_str() ) ; // notice that the snp file is 0-based index. info.length = 1 ; info.nucleotide = fields[4][0] ; } snpInfo.push_back( info ) ; std::vector< int > tmpList ; snpLink.push_back( tmpList ) ; for ( int p = 0 ; p < info.length ; ++p ) { if ( info.type != 'i' || p == 0 ) { if ( positionToSnp.find( info.position + p ) == positionToSnp.end() ) { positionToSnp[ info.position + p] = tmpList ; } positionToSnp[ info.position + p].push_back( k ) ; } } ++k ; } fclose( fp ) ; // Read in the link file. Determine the id of alleles and the association // of alleles and snps. // TODO: obtain the length of each allele and take the length into account in the statistical model // Add the id for the backbound int backboneLength = 0 ; sprintf( buffer, "%s_backbone.fa", argv[1] ) ; fp = fopen( buffer, "r" ) ; /*for ( i = 1 ; buffer[i] && buffer[i] != ' ' && buffer[i] != '\n' ; ++i ) ; buffer[i] = '\0' ; std::string backboneName( buffer + 1 ) ; alleleNameToId[ backboneName ] = 0 ; alleleIdToName.push_back( backboneName ) ;*/ bool start = false ; while ( fgets( buffer, sizeof( buffer ), fp ) ) { if ( buffer[0] == '>' ) { for ( i = 1 ; buffer[i] && buffer[i] != ' ' && buffer[i] != '\n' ; ++i ) ; buffer[i] = '\0' ; if ( !strcmp( backboneName, buffer + 1 ) ) { start = true ; } else if ( start ) break ; } if ( start && buffer[0] != '>' ) { int len = strlen( buffer ) ; if ( buffer[len - 1 ] == '\n' ) backboneLength += len - 1 ; else backboneLength += len ; } } fclose( fp ) ; /*k = 0 ; if ( k == 0 ) { std::vector tmpList ; alleleSnpList.push_back( tmpList ) ; alleleLength.push_back( backboneLength ) ; }*/ // scanning the link file sprintf( buffer, "%s.link", argv[1] ) ; fp = fopen( buffer, "r" ) ; k = 0 ; while ( fgets( buffer, sizeof( buffer ), fp ) ) { std::vector tmpFields ; Split( buffer, '\t', tmpFields ) ; // skip the snps from other backbones if ( snpNameToId.find( tmpFields[0] ) == snpNameToId.end() ) continue ; int snpId = snpNameToId[ tmpFields[0] ] ; Split( tmpFields[1].c_str(), ' ', fields ) ; int size = fields.size() ; for ( i = 0 ; i < size ; ++i ) { if ( alleleNameToId.find( fields[i] ) == alleleNameToId.end() ) { //printf( "%s %d\n", fields[i].c_str(), k ) ; alleleNameToId[ fields[i] ] = k ; alleleIdToName.push_back( fields[i] ) ; std::vector tmpList ; alleleSnpList.push_back( tmpList ) ; alleleLength.push_back( backboneLength ) ; ++k ; } int alleleId = alleleNameToId[ fields[i] ] ; //if ( snpId == 118 ) // printf( "%s: %s %d\n", tmpFields[0].c_str(), fields[i].c_str(), alleleId ) ; snpLink[ snpId ].push_back( alleleId ) ; alleleSnpList[ alleleId ].push_back( snpId ) ; if ( snpInfo[ snpId ].type == 'd' ) { alleleLength[ alleleId ] -= snpInfo[ snpId ].length ; } else if ( snpInfo[ snpId ].type == 'i' ) { alleleLength[ alleleId ] += snpInfo[ snpId ].length ; } } } fclose( fp ) ; int numOfAllele = alleleIdToName.size() ; int numOfSnps = snpLink.size() ; snpAllele = new int* [numOfSnps] ; for ( i = 0 ; i < numOfSnps ; ++i ) { snpAllele[i] = new int[numOfAllele] ; memset( snpAllele[i], 0, sizeof( int ) * numOfAllele ) ; } for ( i = 0 ; i < numOfSnps ; ++i ) { int size = snpLink[i].size() ; for ( j = 0 ; j < size ; ++j ) { snpAllele[i][ snpLink[i][j] ] = 1 ; } } // Compute the compatbility score for each alignment and the allele // Get the number of alignment int numOfAlignments = 0 ; while ( alignments.Next() ) ++numOfAlignments ; alignments.Rewind() ; compatibility = new struct _compatible*[numOfAlignments] ; for ( i = 0 ; i < numOfAlignments ; ++i ) { compatibility[i] = new struct _compatible[ numOfAllele ] ; for ( j = 0 ; j < numOfAllele ; ++j ) { compatibility[i][j].value = 0 ;//-log( (double)alleleLength[j] ) / log( 10.0 ); } } i = 0 ; bool *snpHit = new bool[ numOfSnps ] ; while ( alignments.Next() ) { struct _pair coord = alignments.segments[0] ; alignmentCoords.push_back( coord ) ; memset( snpHit, 0, sizeof( bool ) * numOfSnps ) ; Split( alignments.GetFieldZ( "Zs" ), ',', fields ) ; int size = fields.size() ; for ( k = 0 ; k < size ; ++k ) { std::vector subfields ; Split( fields[k].c_str(), '|', subfields ) ; int snpId = snpNameToId[ subfields[2] ] ; snpHit[ snpId ] = true ; } for ( k = coord.a ; k <= coord.b ; ++k ) { int size = positionToSnp[k].size() ; for ( int l = 0 ; l < size ; ++l ) { // if this SNP is hit. Then other allele don't have this snp // will deduct its likelihood //TODO: the deduction can be based on the quality score of the read int tag = 0 ; int snpId = positionToSnp[k][l] ; if ( snpHit[ snpId ] ) { tag = 0 ; } else { // if this SNP is not hit, then every allele containing this snp // will deduct its likelihood tag = 1 ; } for ( j = 0 ; j < numOfAllele ; ++j ) { if ( snpAllele[ snpId ][j] == tag ) { int v = -2 ; //if ( snpInfo[ snpId ].type == 'd' || snpInfo[ snpId ].type == 'i' ) // v = -4 * snpInfo[ snpId ].length ; if ( snpInfo[ snpId ].type == 'd' && snpInfo[ snpId ].position < k && k != coord.a ) { // The penality has already been subtracted. v = 0 ; } compatibility[i][j].value += v ; /*if ( i == 8 && j == 78 ) { printf( "Bad snp %d: %d %d\n", tag, k, positionToSnp[k][l] ) ; }*/ } } } } ++i ; } //printf( "%d %d\n", numOfAlignments, numOfAllele ) ; // Now, let's consider every pair of alleles, and compute its log likelihood double **logLikelihood ; logLikelihood = new double *[ numOfAllele] ; for ( j = 0 ; j < numOfAllele ; ++j ) { logLikelihood[j] = new double[ numOfAllele ] ; //memset( logLikelihood[j], 0, sizeof( double ) * numOfAllele ) ; for ( k = 0 ; k < numOfAllele ; ++k ) logLikelihood[j][k] = 0 ; } int prevBin = -1 ; double assignJBin = 0 ; double assignKBin = 0 ; for ( j = 0 ; j < numOfAllele ; ++j ) { for ( k = j ; k < numOfAllele ; ++k ) { double binAdjust = 0 ; double averageRead = ( (double)numOfAlignments ) / (double)( alleleLength[j] + alleleLength[k] ) * binSize ; for ( i = 0 ; i < numOfAlignments ; ++i ) { double vj = compatibility[i][j].value ; double vk = compatibility[i][k].value ; double weightJ = 0, weightK = 0 ; if ( vj == vk ) { weightJ = weightK = 0.5 ; } else if ( vj == vk + 2 ) { if ( vj == 0 ) { weightJ = 1 ; } else { weightJ = 0.99 ; weightK = 0.01 ; } } else if ( vk == vj + 2 ) { if ( vk == 0 ) { weightK = 1 ; } else { weightJ = 0.01 ; weightK = 0.99 ; } } else { if ( vk > vj ) weightK = 1 ; else weightJ = 1 ; } double l = weightJ * compatibility[i][j].value + weightK * compatibility[i][k].value ; if ( alignmentCoords[i].a / binSize != prevBin ) { if ( prevBin != -1 && ( assignJBin > averageRead + 4 * sqrt( averageRead ) || assignKBin > averageRead + 4 * sqrt( averageRead ) ) ) { //if ( j == 8 && k == 78 ) // printf( "%lf: %lf %lf %d %d\n", averageRead, assignJBin, assignKBin, alleleLength[j], alleleLength[k] ) ; binAdjust -= 4 ; } prevBin = alignmentCoords[i].a / binSize ; assignJBin = 0 ; assignKBin = 0 ; } assignJBin += weightJ ; assignKBin += weightK ; /*if ( j == 8 && k == 78 && l < 0 ) { printf( "Bad alignment %d (%s %s). %lf %lf: %lf\n", i, alleleIdToName[j].c_str(), alleleIdToName[k].c_str(), compatibility[i][j].value, compatibility[i][k].value, l ) ; }*/ logLikelihood[j][k] += l ; } logLikelihood[j][k] += ( -log( (double)alleleLength[j] ) / log(10.0 ) - log( (double)alleleLength[k] ) / log(10.0) ) ; logLikelihood[j][k] += binAdjust ; } } // Find the result double max ; int maxj = -1 ; int maxk = -1 ; std::vector< struct _result > results ; for ( j = 0 ; j < numOfAllele ; ++j ) { for ( k = j ; k < numOfAllele ; ++k ) { if ( maxj == -1 || logLikelihood[j][k] > max ) { maxj = j ; maxk = k ; max = logLikelihood[j][k] ; } struct _result r ; r.a = j ; r.b = k ; r.logLikelihood = logLikelihood[j][k] ; results.push_back( r ) ; } } //printf( "%s %s %lf\n", alleleIdToName[ maxj ].c_str(), alleleIdToName[ maxk ].c_str(), max) ; //printf( "%lf\n", logLikelihood[124][128] ) ; if ( results.size() == 0 ) { printf( "-1 -1 -1\n" ) ; exit( 1 ) ; } std::sort( results.begin(), results.end(), CompResult ) ; i = 0 ; printf( "%s %s %lf\n", alleleIdToName[ results[i].a ].c_str(), alleleIdToName[ results[i].b ].c_str(), results[i].logLikelihood ) ; k = results.size() ; for ( i = 1 ; i < k ; ++i ) { if ( results[i].logLikelihood != results[0].logLikelihood ) break ; printf( "%s %s %lf\n", alleleIdToName[ results[i].a ].c_str(), alleleIdToName[ results[i].b ].c_str(), results[i].logLikelihood ) ; } return 0 ; }