libStatGen Software 1
Loading...
Searching...
No Matches
SamQuerySeqWithRefHelper.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <stdexcept>
19
20#include "SamQuerySeqWithRefHelper.h"
21#include "BaseUtilities.h"
22#include "SamFlag.h"
23
24SamQuerySeqWithRefIter::SamQuerySeqWithRefIter(SamRecord& record,
25 GenomeSequence& refSequence,
26 bool forward)
27 : myRecord(record),
28 myRefSequence(refSequence),
29 myCigar(NULL),
30 myStartOfReadOnRefIndex(INVALID_GENOME_INDEX),
31 myQueryIndex(0),
32 myForward(forward)
33{
34 myCigar = myRecord.getCigarInfo();
35 myStartOfReadOnRefIndex =
36 refSequence.getGenomePosition(myRecord.getReferenceName());
37
38 if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
39 {
40 // This reference name was found in the reference file, so
41 // add the start position.
42 myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
43 }
44
45 if(!forward)
46 {
47 myQueryIndex = myRecord.getReadLength() - 1;
48 }
49}
50
51
52SamQuerySeqWithRefIter::~SamQuerySeqWithRefIter()
53{
54}
55
56
57
59{
60 myCigar = myRecord.getCigarInfo();
61 if(myCigar == NULL)
62 {
63 // Failed to get Cigar.
64 return(false);
65 }
66
67 // Get where the position of where this read starts as mapped to the
68 // reference.
69 myStartOfReadOnRefIndex =
70 myRefSequence.getGenomePosition(myRecord.getReferenceName());
71
72 if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
73 {
74 // This reference name was found in the reference file, so
75 // add the start position.
76 myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
77 }
78
79 myForward = forward;
80
81 if(myForward)
82 {
83 myQueryIndex = 0;
84 }
85 else
86 {
87 // reverse, so start at the last entry.
88 myQueryIndex = myRecord.getReadLength() - 1;
89 }
90 return(true);
91}
92
93
94// Returns information for the next position where the query and the
95// reference match or mismatch. To be a match or mismatch, both the query
96// and reference must have a base that is not 'N'.
97// This means:
98// insertions and deletions are not mismatches or matches.
99// 'N' bases are not matches or mismatches
100// Returns true if an entry was found, false if there are no more matches or
101// mismatches.
103{
104 // Check whether or not this read is mapped.
105 // If the read is not mapped, return no matches.
106 if(!SamFlag::isMapped(myRecord.getFlag()))
107 {
108 // Not mapped.
109 return(false);
110 }
111
112 // Check that the Cigar is set.
113 if(myCigar == NULL)
114 {
115 // Error.
116 throw(std::runtime_error("Cannot determine matches/mismatches since failed to retrieve the cigar"));
117 return(false);
118 }
119
120 // If myStartOfReadOnRefIndex is the default (unset) value, then
121 // the reference was not found, so return false, no matches/mismatches.
122 if(myStartOfReadOnRefIndex == INVALID_GENOME_INDEX)
123 {
124 // This reference name was not found in the reference file, so just
125 // return no matches/mismatches.
126 return(false);
127 }
128
129
130 // Repull the read length from the record to check just in case the
131 // record has changed length.
132 // Loop until a match or mismatch is found as long as query index
133 // is still within the read (Loop is broken by a return).
134 while((myQueryIndex < myRecord.getReadLength()) &&
135 (myQueryIndex >= 0))
136 {
137 // Still more bases, look for a match/mismatch.
138
139 // Get the reference offset for this read position.
140 int32_t refOffset = myCigar->getRefOffset(myQueryIndex);
141 if(refOffset == Cigar::INDEX_NA)
142 {
143 // This is either a softclip or an insertion
144 // which do not count as a match or a mismatch, so
145 // go to the next index.
146 nextIndex();
147 continue;
148 }
149
150 // Both the reference and the read have a base, so get the bases.
151 char readBase = myRecord.getSequence(myQueryIndex, SamRecord::NONE);
152 char refBase = myRefSequence[myStartOfReadOnRefIndex + refOffset];
153
154 // If either the read or the reference bases are unknown, then
155 // it does not count as a match or a mismatch.
156 if(BaseUtilities::isAmbiguous(readBase) ||
158 {
159 // Either the reference base or the read base are unknown,
160 // so skip this position.
161 nextIndex();
162 continue;
163 }
164
165 // Both the read & the reference have a known base, so it is either
166 // a match or a mismatch.
167 matchMismatchInfo.setQueryIndex(myQueryIndex);
168
169 // Check if they are equal.
170 if(BaseUtilities::areEqual(readBase, refBase))
171 {
172 // Match.
173 matchMismatchInfo.setType(SamSingleBaseMatchInfo::MATCH);
174 // Increment the query index to the next position.
175 nextIndex();
176 return(true);
177 }
178 else
179 {
180 // Mismatch
181 matchMismatchInfo.setType(SamSingleBaseMatchInfo::MISMATCH);
182 // Increment the query index to the next position.
183 nextIndex();
184 return(true);
185 }
186 }
187
188 // No matches or mismatches were found, so return false.
189 return(false);
190}
191
192
193void SamQuerySeqWithRefIter::nextIndex()
194{
195 if(myForward)
196 {
197 ++myQueryIndex;
198 }
199 else
200 {
201 --myQueryIndex;
202 }
203}
204
205
206SamSingleBaseMatchInfo::SamSingleBaseMatchInfo()
207 : myType(UNKNOWN),
208 myQueryIndex(0)
209{
210}
211
212
213SamSingleBaseMatchInfo::~SamSingleBaseMatchInfo()
214{
215}
216
217
222
223
225{
226 return(myQueryIndex);
227}
228
229
231{
232 myType = newType;
233}
234
235
237{
238 myQueryIndex = queryIndex;
239}
240
241
242///////////////////////////////////////////////////////////////////////////
243void SamQuerySeqWithRef::seqWithEquals(const char* currentSeq,
244 int32_t seq0BasedPos,
245 Cigar& cigar,
246 const char* referenceName,
247 const GenomeSequence& refSequence,
248 std::string& updatedSeq)
249{
250 updatedSeq = currentSeq;
251
252 int32_t seqLength = updatedSeq.length();
253 int32_t queryIndex = 0;
254
255 uint32_t startOfReadOnRefIndex =
256 refSequence.getGenomePosition(referenceName);
257
258 if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
259 {
260 // This reference name was not found in the reference file, so just
261 // return.
262 return;
263 }
264 startOfReadOnRefIndex += seq0BasedPos;
265
266 // Loop until the entire sequence has been updated.
267 while(queryIndex < seqLength)
268 {
269 // Still more bases, look for matches.
270
271 // Get the reference offset for this read position.
272 int32_t refOffset = cigar.getRefOffset(queryIndex);
273 if(refOffset != Cigar::INDEX_NA)
274 {
275 // Both the reference and the read have a base, so get the bases.
276 char readBase = currentSeq[queryIndex];
277 char refBase = refSequence[startOfReadOnRefIndex + refOffset];
278
279 // If neither base is unknown and they are the same, count it
280 // as a match.
281 if(!BaseUtilities::isAmbiguous(readBase) &&
282 !BaseUtilities::isAmbiguous(refBase) &&
283 (BaseUtilities::areEqual(readBase, refBase)))
284 {
285 // Match.
286 updatedSeq[queryIndex] = '=';
287 }
288 }
289 // Increment the query index to the next position.
290 ++queryIndex;
291 continue;
292 }
293}
294
295
296void SamQuerySeqWithRef::seqWithoutEquals(const char* currentSeq,
297 int32_t seq0BasedPos,
298 Cigar& cigar,
299 const char* referenceName,
300 const GenomeSequence& refSequence,
301 std::string& updatedSeq)
302{
303 updatedSeq = currentSeq;
304
305 int32_t seqLength = updatedSeq.length();
306 int32_t queryIndex = 0;
307
308 uint32_t startOfReadOnRefIndex =
309 refSequence.getGenomePosition(referenceName);
310
311 if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
312 {
313 // This reference name was not found in the reference file, so just
314 // return.
315 return;
316 }
317 startOfReadOnRefIndex += seq0BasedPos;
318
319 // Loop until the entire sequence has been updated.
320 while(queryIndex < seqLength)
321 {
322 // Still more bases, look for matches.
323
324 // Get the reference offset for this read position.
325 int32_t refOffset = cigar.getRefOffset(queryIndex);
326 if(refOffset != Cigar::INDEX_NA)
327 {
328 // Both the reference and the read have a base, so get the bases.
329 char readBase = currentSeq[queryIndex];
330 char refBase = refSequence[startOfReadOnRefIndex + refOffset];
331
332 // If the bases are equal, set the sequence to the reference
333 // base. (Skips the check for ambiguous to catch a case where
334 // ambiguous had been converted to a '=', and if both are ambiguous,
335 // it will still be set to ambiguous.)
336 if(BaseUtilities::areEqual(readBase, refBase))
337 {
338 // Match.
339 updatedSeq[queryIndex] = refBase;
340 }
341 }
342
343 // Increment the query index to the next position.
344 ++queryIndex;
345 continue;
346 }
347}
static bool isAmbiguous(char base)
Returns whether or not the specified bases is an indicator for ambiguity.
static bool areEqual(char base1, char base2)
Returns whether or not two bases are equal (case insensitive), if one of the bases is '=',...
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition Cigar.h:84
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition Cigar.h:492
int32_t getRefOffset(int32_t queryIndex)
Return the reference offset associated with the specified query index or INDEX_NA based on this cigar...
Definition Cigar.cpp:187
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
bool reset(bool forward=true)
Reset to start at the beginning of the record.
bool getNextMatchMismatch(SamSingleBaseMatchInfo &matchMismatchInfo)
Returns information for the next position where the query and the reference match or mismatch.
static void seqWithoutEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence converting '=' to the appropriate base using the reference.
static void seqWithEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence with '=' in any position where the sequence matches the reference.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition SamRecord.h:52
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
@ NONE
Leave the sequence as is.
Definition SamRecord.h:58
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
uint16_t getFlag()
Get the flag (FLAG).
int32_t getReadLength()
Get the length of the read.
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
This class contains the match/mismatch information between the reference and a read for a single base...
void setQueryIndex(int32_t queryIndex)
Set the query index for this object.
void setType(Type newType)
Set the type (match/mismatch/unkown) for this object.
Type
More types can be added later as needed.
int32_t getQueryIndex()
Get the query index for this object.
Type getType()
Get the type (match/mismatch/unknown) for this object.