954 |
|
return false; |
955 |
|
} |
956 |
|
|
957 |
+ |
bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
958 |
+ |
BowtieHit& bh, |
959 |
+ |
bool strip_slash, |
960 |
+ |
char* name_out, |
961 |
+ |
char* name_tags, |
962 |
+ |
char* seq, |
963 |
+ |
char* qual) |
964 |
+ |
{ |
965 |
+ |
if (!orig_bwt_buf || !*orig_bwt_buf) |
966 |
+ |
return false; |
967 |
+ |
|
968 |
+ |
char bwt_buf[2048]; |
969 |
+ |
strcpy(bwt_buf, orig_bwt_buf); |
970 |
+ |
// Are we still in the header region? |
971 |
+ |
if (bwt_buf[0] == '@') |
972 |
+ |
return false; |
973 |
+ |
|
974 |
+ |
//char orientation; |
975 |
+ |
/* |
976 |
+ |
char mismatches[buf_size]; |
977 |
+ |
|
978 |
+ |
//char* orientation_str = get_token((char**)&buf,"\t"); |
979 |
+ |
|
980 |
+ |
mismatches[0] = 0; |
981 |
+ |
char* mismatches_str = get_token((char**)&buf, "\t"); |
982 |
+ |
if (mismatches_str) |
983 |
+ |
strcpy(mismatches, mismatches_str); |
984 |
+ |
|
985 |
+ |
orientation = orientation_str[0]; |
986 |
+ |
text_offset = atoi(text_offset_str); |
987 |
+ |
*/ |
988 |
+ |
|
989 |
+ |
char* buf = bwt_buf; |
990 |
+ |
char* name = get_token((char**)&buf,"\t"); |
991 |
+ |
char* sam_flag_str = get_token((char**)&buf,"\t"); |
992 |
+ |
char* text_name = get_token((char**)&buf,"\t"); |
993 |
+ |
char* text_offset_str = get_token((char**)&buf,"\t"); |
994 |
+ |
const char* map_qual_str = get_token((char**)&buf,"\t"); |
995 |
+ |
char* cigar_str = get_token((char**)&buf,"\t"); |
996 |
+ |
const char* mate_ref_str = get_token((char**)&buf,"\t"); |
997 |
+ |
const char* mate_pos_str = get_token((char**)&buf,"\t"); |
998 |
+ |
const char* inferred_insert_sz_str = get_token((char**)&buf,"\t"); |
999 |
+ |
|
1000 |
+ |
const char* seq_str = get_token((char**)&buf,"\t"); |
1001 |
+ |
if (seq) |
1002 |
+ |
strcpy(seq, seq_str); |
1003 |
+ |
|
1004 |
+ |
const char* qual_str = get_token((char**)&buf,"\t"); |
1005 |
+ |
if (qual) |
1006 |
+ |
strcpy(qual, qual_str); |
1007 |
+ |
|
1008 |
+ |
if (!name || |
1009 |
+ |
!sam_flag_str || |
1010 |
+ |
!text_name || |
1011 |
+ |
!text_offset_str || |
1012 |
+ |
!map_qual_str || |
1013 |
+ |
!cigar_str || |
1014 |
+ |
!mate_ref_str || |
1015 |
+ |
!mate_pos_str || |
1016 |
+ |
!inferred_insert_sz_str || |
1017 |
+ |
!seq_str || |
1018 |
+ |
!qual_str) |
1019 |
+ |
{ |
1020 |
+ |
// truncated or malformed SAM record |
1021 |
+ |
return false; |
1022 |
+ |
} |
1023 |
+ |
|
1024 |
+ |
|
1025 |
+ |
int sam_flag = atoi(sam_flag_str); |
1026 |
+ |
int text_offset = atoi(text_offset_str); |
1027 |
+ |
|
1028 |
+ |
//############################################## |
1029 |
+ |
bool end = true; |
1030 |
+ |
unsigned int seg_offset = 0; |
1031 |
+ |
unsigned int seg_num = 0; |
1032 |
+ |
unsigned int num_segs = 0; |
1033 |
+ |
|
1034 |
+ |
// Copy the tag out of the name field before we might wipe it out |
1035 |
+ |
char* pipe = strrchr(name, '|'); |
1036 |
+ |
if (pipe) |
1037 |
+ |
{ |
1038 |
+ |
if (name_tags) |
1039 |
+ |
strcpy(name_tags, pipe); |
1040 |
+ |
|
1041 |
+ |
char* tag_buf = pipe + 1; |
1042 |
+ |
if (strchr(tag_buf, ':')) |
1043 |
+ |
{ |
1044 |
+ |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
1045 |
+ |
if (seg_num + 1 == num_segs) |
1046 |
+ |
end = true; |
1047 |
+ |
else |
1048 |
+ |
end = false; |
1049 |
+ |
} |
1050 |
+ |
|
1051 |
+ |
*pipe = 0; |
1052 |
+ |
} |
1053 |
+ |
// Stripping the slash and number following it gives the insert name |
1054 |
+ |
char* slash = strrchr(name, '/'); |
1055 |
+ |
if (strip_slash && slash) |
1056 |
+ |
*slash = 0; |
1057 |
+ |
|
1058 |
+ |
//int read_len = strlen(sequence); |
1059 |
+ |
|
1060 |
+ |
// Add this alignment to the table of hits for this half of the |
1061 |
+ |
// Bowtie map |
1062 |
+ |
|
1063 |
+ |
// Parse the text_name field to recover the splice coords |
1064 |
+ |
vector<string> toks; |
1065 |
+ |
|
1066 |
+ |
tokenize_strict(text_name, "|", toks); |
1067 |
+ |
|
1068 |
+ |
int num_extra_toks = (int)toks.size() - 6; |
1069 |
+ |
|
1070 |
+ |
if (num_extra_toks >= 0) |
1071 |
+ |
{ |
1072 |
+ |
static const uint8_t left_window_edge_field = 1; |
1073 |
+ |
static const uint8_t splice_field = 2; |
1074 |
+ |
//static const uint8_t right_window_edge_field = 3; |
1075 |
+ |
static const uint8_t junction_type_field = 4; |
1076 |
+ |
static const uint8_t strand_field = 5; |
1077 |
+ |
|
1078 |
+ |
string contig = toks[0]; |
1079 |
+ |
for (int t = 1; t <= num_extra_toks; ++t) |
1080 |
+ |
{ |
1081 |
+ |
contig += "|"; |
1082 |
+ |
contig += toks[t]; |
1083 |
+ |
} |
1084 |
+ |
|
1085 |
+ |
vector<string> splice_toks; |
1086 |
+ |
tokenize(toks[num_extra_toks + splice_field], "-", splice_toks); |
1087 |
+ |
if (splice_toks.size() != 2) |
1088 |
+ |
{ |
1089 |
+ |
fprintf(stderr, "Warning: found malformed splice record, skipping:\n"); |
1090 |
+ |
//fprintf(stderr, "%s (token: %s)\n", text_name, |
1091 |
+ |
// toks[num_extra_toks + splice_field].c_str()); |
1092 |
+ |
return false; |
1093 |
+ |
} |
1094 |
+ |
|
1095 |
+ |
// |
1096 |
+ |
// check for an insertion hit |
1097 |
+ |
// |
1098 |
+ |
if(toks[num_extra_toks + junction_type_field] == "ins") |
1099 |
+ |
{ |
1100 |
+ |
int8_t spliced_read_len = strlen(seq_str); |
1101 |
+ |
/* |
1102 |
+ |
* The 0-based position of the left edge of the alignment. Note that this |
1103 |
+ |
* value may need to be futher corrected to account for the presence of |
1104 |
+ |
* of the insertion. |
1105 |
+ |
*/ |
1106 |
+ |
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
1107 |
+ |
uint32_t right = left + spliced_read_len - 1; |
1108 |
+ |
|
1109 |
+ |
/* |
1110 |
+ |
* The 0-based position of the last genomic sequence before the insertion |
1111 |
+ |
*/ |
1112 |
+ |
uint32_t left_splice_pos = atoi(splice_toks[0].c_str()); |
1113 |
+ |
|
1114 |
+ |
string insertedSequence = splice_toks[1]; |
1115 |
+ |
/* |
1116 |
+ |
* The 0-based position of the first genomic sequence after teh insertion |
1117 |
+ |
*/ |
1118 |
+ |
uint32_t right_splice_pos = left_splice_pos + 1; |
1119 |
+ |
if(left > left_splice_pos){ |
1120 |
+ |
/* |
1121 |
+ |
* The genomic position of the left edge of the alignment needs to be corrected |
1122 |
+ |
* If the alignment does not extend into the insertion, simply subtract the length |
1123 |
+ |
* of the inserted sequence, otherwise, just set it equal to the right edge |
1124 |
+ |
*/ |
1125 |
+ |
left = left - insertedSequence.length(); |
1126 |
+ |
if(left < right_splice_pos){ |
1127 |
+ |
left = right_splice_pos; |
1128 |
+ |
} |
1129 |
+ |
} |
1130 |
+ |
if(right > left_splice_pos){ |
1131 |
+ |
right = right - insertedSequence.length(); |
1132 |
+ |
if(right < left_splice_pos){ |
1133 |
+ |
right = left_splice_pos; |
1134 |
+ |
} |
1135 |
+ |
} |
1136 |
+ |
/* |
1137 |
+ |
* Now, right and left should be properly transformed into genomic coordinates |
1138 |
+ |
* We should be able to deduce how much the alignment matches the insertion |
1139 |
+ |
* simply based on the length of the read |
1140 |
+ |
*/ |
1141 |
+ |
int left_match_length = 0; |
1142 |
+ |
if(left <= left_splice_pos){ |
1143 |
+ |
left_match_length = left_splice_pos - left + 1; |
1144 |
+ |
} |
1145 |
+ |
int right_match_length = 0; |
1146 |
+ |
if(right >= right_splice_pos){ |
1147 |
+ |
right_match_length = right - right_splice_pos + 1; |
1148 |
+ |
} |
1149 |
+ |
int insertion_match_length = spliced_read_len - left_match_length - right_match_length; |
1150 |
+ |
|
1151 |
+ |
if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0) |
1152 |
+ |
return false; |
1153 |
+ |
|
1154 |
+ |
string junction_strand = toks[num_extra_toks + strand_field]; |
1155 |
+ |
if(junction_strand != "rev" && junction_strand != "fwd"){ |
1156 |
+ |
fprintf(stderr, "Malformed insertion record\n"); |
1157 |
+ |
return false; |
1158 |
+ |
} |
1159 |
+ |
|
1160 |
+ |
char* pch = strtok( mismatches, ","); |
1161 |
+ |
unsigned char num_mismatches = 0; |
1162 |
+ |
|
1163 |
+ |
/* |
1164 |
+ |
* remember that text_offset holds the left end of the |
1165 |
+ |
* alignment, relative to the start of the contig |
1166 |
+ |
*/ |
1167 |
+ |
|
1168 |
+ |
/* |
1169 |
+ |
* The 0-based relative position of the left-most character |
1170 |
+ |
* before the insertion in the contig |
1171 |
+ |
*/ |
1172 |
+ |
int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str()); |
1173 |
+ |
while (pch != NULL) |
1174 |
+ |
{ |
1175 |
+ |
char* colon = strchr(pch, ':'); |
1176 |
+ |
if (colon) |
1177 |
+ |
{ |
1178 |
+ |
*colon = 0; |
1179 |
+ |
int mismatch_pos = atoi(pch); |
1180 |
+ |
|
1181 |
+ |
/* |
1182 |
+ |
* for reversely mapped reads, |
1183 |
+ |
* find the correct mismatched position. |
1184 |
+ |
*/ |
1185 |
+ |
if (sam_flag & 0x0010){ |
1186 |
+ |
mismatch_pos = spliced_read_len - mismatch_pos - 1; |
1187 |
+ |
} |
1188 |
+ |
|
1189 |
+ |
/* |
1190 |
+ |
* Only count mismatches outside of the insertion region |
1191 |
+ |
* If there is a mismatch within the insertion, |
1192 |
+ |
* disallow this hit |
1193 |
+ |
*/ |
1194 |
+ |
if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){ |
1195 |
+ |
num_mismatches++; |
1196 |
+ |
}else{ |
1197 |
+ |
return false; |
1198 |
+ |
} |
1199 |
+ |
} |
1200 |
+ |
pch = strtok (NULL, ","); |
1201 |
+ |
} |
1202 |
+ |
|
1203 |
+ |
|
1204 |
+ |
vector<CigarOp> cigar; |
1205 |
+ |
cigar.push_back(CigarOp(MATCH, left_match_length)); |
1206 |
+ |
cigar.push_back(CigarOp(INS, insertion_match_length)); |
1207 |
+ |
cigar.push_back(CigarOp(MATCH, right_match_length)); |
1208 |
+ |
|
1209 |
+ |
/* |
1210 |
+ |
* For now, disallow hits that don't span |
1211 |
+ |
* the insertion. If we allow these types of hits, |
1212 |
+ |
* then long_spanning.cpp needs to be updated |
1213 |
+ |
* in order to intelligently merge these kinds |
1214 |
+ |
* of reads back together |
1215 |
+ |
* |
1216 |
+ |
* Following code has been changed to allow segment that end |
1217 |
+ |
* in an insertion |
1218 |
+ |
*/ |
1219 |
+ |
bh = create_hit(name, |
1220 |
+ |
contig, |
1221 |
+ |
left, |
1222 |
+ |
cigar, |
1223 |
+ |
sam_flag & 0x0010, //####### |
1224 |
+ |
junction_strand == "rev", |
1225 |
+ |
num_mismatches, |
1226 |
+ |
0, |
1227 |
+ |
end); |
1228 |
+ |
return true; |
1229 |
+ |
} |
1230 |
+ |
|
1231 |
+ |
else |
1232 |
+ |
{ |
1233 |
+ |
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
1234 |
+ |
int spliced_read_len = strlen(seq_str); |
1235 |
+ |
int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1; |
1236 |
+ |
if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len; |
1237 |
+ |
int8_t right_splice_pos = spliced_read_len - left_splice_pos; |
1238 |
+ |
|
1239 |
+ |
if (right_splice_pos <= 0 || left_splice_pos <= 0) |
1240 |
+ |
return false; |
1241 |
+ |
|
1242 |
+ |
if ((sam_flag & 0x0010) == 0) //###### |
1243 |
+ |
{ |
1244 |
+ |
if (left_splice_pos + seg_offset < _anchor_length){ |
1245 |
+ |
return false; |
1246 |
+ |
} |
1247 |
+ |
} |
1248 |
+ |
else |
1249 |
+ |
{ |
1250 |
+ |
if (right_splice_pos + seg_offset < _anchor_length) |
1251 |
+ |
return false; |
1252 |
+ |
} |
1253 |
+ |
//uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos; |
1254 |
+ |
//atoi(toks[right_window_edge_field].c_str()); |
1255 |
+ |
int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1; |
1256 |
+ |
|
1257 |
+ |
string junction_strand = toks[num_extra_toks + strand_field]; |
1258 |
+ |
if (!(junction_strand == "rev" || junction_strand == "fwd")|| |
1259 |
+ |
!(orientation == '-' || orientation == '+')) |
1260 |
+ |
{ |
1261 |
+ |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
1262 |
+ |
//fprintf(stderr, "junction_strand=%s, orientation='%c'\n", |
1263 |
+ |
// junction_strand.c_str(), orientation); |
1264 |
+ |
return false; |
1265 |
+ |
} |
1266 |
+ |
|
1267 |
+ |
//vector<string> mismatch_toks; |
1268 |
+ |
char* pch = strtok (mismatches,","); |
1269 |
+ |
int mismatches_in_anchor = 0; |
1270 |
+ |
unsigned char num_mismatches = 0; |
1271 |
+ |
while (pch != NULL) |
1272 |
+ |
{ |
1273 |
+ |
char* colon = strchr(pch, ':'); |
1274 |
+ |
if (colon) |
1275 |
+ |
{ |
1276 |
+ |
*colon = 0; |
1277 |
+ |
num_mismatches++; |
1278 |
+ |
int mismatch_pos = atoi(pch); |
1279 |
+ |
if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) || |
1280 |
+ |
(orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len) |
1281 |
+ |
mismatches_in_anchor++; |
1282 |
+ |
} |
1283 |
+ |
//mismatch_toks.push_back(pch); |
1284 |
+ |
pch = strtok (NULL, ","); |
1285 |
+ |
} |
1286 |
+ |
|
1287 |
+ |
// FIXME: we probably should exclude these hits somewhere, but this |
1288 |
+ |
// isn't the right place |
1289 |
+ |
vector<CigarOp> cigar; |
1290 |
+ |
cigar.push_back(CigarOp(MATCH, left_splice_pos)); |
1291 |
+ |
if(toks[num_extra_toks + junction_type_field] == "del"){ |
1292 |
+ |
cigar.push_back(CigarOp(DEL, gap_len)); |
1293 |
+ |
}else{ |
1294 |
+ |
cigar.push_back(CigarOp(REF_SKIP, gap_len)); |
1295 |
+ |
} |
1296 |
+ |
cigar.push_back(CigarOp(MATCH, right_splice_pos)); |
1297 |
+ |
|
1298 |
+ |
bh = create_hit(name, |
1299 |
+ |
contig, |
1300 |
+ |
left, |
1301 |
+ |
cigar, |
1302 |
+ |
orientation == '-', |
1303 |
+ |
junction_strand == "rev", |
1304 |
+ |
num_mismatches, |
1305 |
+ |
mismatches_in_anchor, |
1306 |
+ |
end); |
1307 |
+ |
return true; |
1308 |
+ |
} |
1309 |
+ |
} |
1310 |
+ |
else |
1311 |
+ |
{ |
1312 |
+ |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
1313 |
+ |
//fprintf(stderr, "%s\n", orig_bwt_buf); |
1314 |
+ |
// continue; |
1315 |
+ |
return false; |
1316 |
+ |
} |
1317 |
+ |
|
1318 |
+ |
return false; |
1319 |
+ |
} |
1320 |
|
|
1321 |
|
|
1322 |
|
bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |