libStatGen Software 1
Loading...
Searching...
No Matches
BgzfFileTypeRecovery.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifdef __ZLIB_AVAILABLE__
19
20#include "BgzfFileTypeRecovery.h"
21
22#include <stdio.h>
23#include <stdint.h>
24
25#include <sys/types.h>
26#include <sys/stat.h>
27#include <fcntl.h>
28#include <stdlib.h>
29#include <string.h>
30#include <zlib.h>
31
32#include <fstream>
33#include <iostream>
34#include <stdexcept>
35#include <vector>
36
37#pragma pack(push,1)
38
39#define debug false
40
//
// Fixed leading portion of a gzip member header, with the fields declared
// in stream order so that raw bytes read from a file can be copied directly
// over an instance.  Multi-byte fields are interpreted in host byte order.
//
// sane() accepts exactly one header shape: the one defaults() produces.
//
class RecoveryGzipHeader {
private:
    uint8_t  m_ID1;
    uint8_t  m_ID2;
    uint8_t  m_CM;
    uint8_t  m_FLG;
    uint32_t m_MTIME;
    uint8_t  m_XFL;
    uint8_t  m_OS;
    uint16_t m_XLEN;
public:
    // All fields start at zero, so a fresh header is NOT sane() until it is
    // filled in (either from a stream or via defaults()).
    RecoveryGzipHeader() :
        m_ID1(0),
        m_ID2(0),
        m_CM(0),
        m_FLG(0),
        m_MTIME(0),
        m_XFL(0),
        m_OS(0),
        m_XLEN(0)
    {}

    // Load the one set of values that sane() recognizes.
    void defaults() {
        m_ID1 = 31;     // gzip magic, first byte
        m_ID2 = 139;    // gzip magic, second byte
        m_CM = 8;       // compression method: deflate
        m_FLG = 4;      // extra field present
        m_MTIME = 0;
        m_XFL = 0;
        m_OS = 255;     // "unknown" operating system
        m_XLEN = 6;     // length of the extra field that follows
    }

    // Read-only field accessors; const so they work on const headers too.
    uint8_t  ID1()   const {return m_ID1;}
    uint8_t  ID2()   const {return m_ID2;}
    uint8_t  CM()    const {return m_CM;}
    uint8_t  FLG()   const {return m_FLG;}
    uint32_t MTIME() const {return m_MTIME;}
    uint8_t  XFL()   const {return m_XFL;}
    uint8_t  OS()    const {return m_OS;}
    uint16_t XLEN()  const {return m_XLEN;}

    // True only when every field holds the value set by defaults().
    bool sane() const {
        return (m_ID1==31 && m_ID2==139 && m_CM==8 && m_FLG==4 && m_MTIME==0 && m_XFL == 0 && m_OS == 255 && m_XLEN==6);
    }
};
85
86class BGZFHeader : public RecoveryGzipHeader {
87private:
88 uint8_t m_SI1;
89 uint8_t m_SI2;
90 uint16_t m_SLEN; // little endian
91 uint16_t m_BSIZE; // little endian
92public:
93 BGZFHeader(
94 uint8_t m_SI1 = 'B',
95 uint8_t m_SI2 = 'C',
96 uint16_t m_SLEN = 2,
97 uint16_t m_BSIZE = 0
98 ) : m_SI1(m_SI1), m_SI2(m_SI2), m_SLEN(m_SLEN), m_BSIZE(m_BSIZE) {;}
99 uint8_t SI1() {return m_SI1;}
100 uint8_t SI2() {return m_SI2;}
101 uint16_t SLEN() {return m_SLEN;}
102 uint16_t BSIZE() {return m_BSIZE;}
103 bool sane() {
104 return RecoveryGzipHeader::sane() &&
105 (m_SI1=='B' && m_SI2=='C' && m_SLEN==2 && m_BSIZE > sizeof(BGZFHeader));
106 }
107};
108
109#pragma pack(pop)
110
111//
112// PeekaheadBuffer allows non-destructive peekahead and resyncing
113// after read errors when the underlying stream has signatures in the
114// data that allow it.
115//
116// In addition, it has a peek() capability to allow
117// the caller to look ahead in the stream to see
118// a certain number of bytes before actually consuming them.
119//
120// The intent is that this class behave as something of a poor
121// man's FIFO - with the cost of buffer movement when data is removed.
122//
123// This is far from ideal, but we basically are moving data around
124// when allowing arbitrary peekahead regardless.
125//
// The basis for the design is the fact that most read calls to
// various streams at best allow a single character to be peeked
// at, and secondly, do not allow for recovery after an underlying
// framing error occurs.
130//
// That is, getchar()/putchar()/ungetc() support a single byte
// peek. This may be fine for simple parsing applications, but here
// we need to look at blocks up to 64K or more in size to search
// for signatures while re-synchronizing on the underlying stream.
135//
class PeekaheadBuffer : public std::vector<uint8_t> {
    //
    // Contract:
    //  - the inherited vector holds every byte fetched so far;
    //  - bytes in [0, m_startPosition) have already been consumed by read();
    //  - bytes in [m_startPosition, size()) are available for peeking.
    // Derived classes supply readahead() to fetch more bytes.
    //

protected:
    ssize_t m_startPosition;    // start of fresh (unconsumed) data

public:
    enum ReturnCode {
        endOfFile = -1,
        reSync = 0,
        ok = 1
    };

    ssize_t startPosition() {return m_startPosition;}

private:
    //
    // Once the unconsumed data has shrunk below 1/8 of the whole buffer,
    // drop the consumed prefix so the fresh data starts at offset 0 again.
    //
    // for use by read(), which consumes data from the buffer.
    //
    void shiftData() {
        if(dataRemaining() < (ssize_t) (std::vector<uint8_t>::size() / 8) ) {
            erase(begin(), begin() + m_startPosition);
            m_startPosition = 0;
        }
    }

    // called when a read reports an error for some reason -
    // the default simply throws away everything buffered.
    virtual ReturnCode sync();

public:
    PeekaheadBuffer();
    virtual ~PeekaheadBuffer();

    // return the amount of unused data:
    ssize_t dataRemaining();

    //
    // NB: the inherited size() counts consumed bytes as well - use
    // dataRemaining() to find out how much can still be read.
    //

    //
    // just populate data in buffer from stream - not generic
    //
    // XXX note that it simply ensures that count bytes of data
    // are actually loaded into the buffer - if that amount of
    // data (or more) is present, this call is a NOP.
    //
    virtual ReturnCode readahead(ssize_t count) = 0;

    //
    // read is generic: it calls the non-generic readahead() to make sure
    // count bytes are buffered, copies them to buffer and consumes them.
    //
    // Returns ok when exactly count bytes were delivered; reSync or
    // endOfFile (passed on from readahead) when nothing was delivered.
    // A request for zero (or fewer) bytes succeeds without touching
    // either buffer.
    //
    ReturnCode read(uint8_t *buffer, ssize_t count) {
        if(count <= 0) {
            // nothing to copy; also avoids dereferencing begin() on an
            // empty vector and handing memcpy a negative length
            return ok;
        }

        const ReturnCode rc = readahead(count);

        if(rc == reSync) {
            // CRC error, other errors - caller should see if it can sync forwards
            return reSync;
        }
        if(rc != ok) {
            // failed to get needed data - premature EOF, I guess
            return endOfFile;
        }

        memcpy(buffer, &(*begin()) + m_startPosition, count);

        m_startPosition += count;   // consume data

        // recover space if wasting too much:
        shiftData();

        return ok;
    }

};

PeekaheadBuffer::PeekaheadBuffer() : m_startPosition(0)
{
}

PeekaheadBuffer::~PeekaheadBuffer()
{
}

//
// Discard all buffered data.  The read offset has to be rewound together
// with the storage: leaving it pointing past the now-empty vector would make
// dataRemaining() negative and cause later reads to skip fresh data.
//
PeekaheadBuffer::ReturnCode PeekaheadBuffer::sync() {
    clear();
    m_startPosition = 0;
    return ok;
}

// Number of buffered bytes that have not been consumed yet.
ssize_t PeekaheadBuffer::dataRemaining()
{
    return std::vector<uint8_t>::size() - m_startPosition;
}
233
234
235// peekahead buffered file reader class
236class FileReader : public PeekaheadBuffer {
237 FILE *m_stream;
238public:
239 FileReader();
240 ~FileReader();
241 FileReader(FILE *stream);
242 PeekaheadBuffer::ReturnCode readahead(ssize_t count);
243 FILE *stream() {return m_stream;}
244 bool eof() {return m_stream ? feof(m_stream) : false;}
245};
246
247FileReader::FileReader()
248{
249 m_stream = NULL;
250}
251
252FileReader::FileReader(FILE *stream) : m_stream(stream)
253{
254}
255
256FileReader::~FileReader()
257{
258 fclose(m_stream);
259 m_stream = NULL;
260}
261
262//
263// fill buffer until we have count bytes of valid
264// data.
265//
266// need to detect error and eof and return appropriate values.
267//
268PeekaheadBuffer::ReturnCode FileReader::readahead(ssize_t count)
269{
270 uint8_t buffer[4096];
271 while(dataRemaining() < count) {
272 int bytesRead = fread(buffer, 1, sizeof(buffer), m_stream);
273 if(bytesRead==0) {
274 if(ferror(m_stream)) {
275 return reSync;
276 }
277 // ain't getting no more data...
278 return endOfFile;
279 }
280#if 0
281 fprintf(stderr, "\n\n");
282 int possible = -1;
283 for(int i=0;i<bytesRead;i+=16) {
284 fprintf(stderr,"%08x: ", i);
285 for(int j=0;j<16;j++) {
286 if(buffer[i+j]==31 && buffer[i+j+1]==139) {
287 possible = i+j;
288 }
289 fprintf(stderr,"%02x ", buffer[i+j]);
290 }
291 fprintf(stderr, "\n");
292 }
293 if(possible>0) {
294 fprintf(stderr,"possible signature at %08x\n", possible);
295 }
296#endif
297 insert(end(), &buffer[0], &buffer[0] + bytesRead);
298 }
299 return ok;
300}
301
302class BGZFReader : public PeekaheadBuffer {
303 FileReader m_fileReader;
304
305public:
306
307 BGZFReader(FILE *stream) : m_fileReader(stream) {;}
308
309 PeekaheadBuffer::ReturnCode readahead(ssize_t count);
310
311 //
312 // This will be reading data, and needs to return EOF, etc
313 //
314 ReturnCode sync() {
315 // my internal data is now bad, so we'll scan ahead seeing
316 // if we can find a good header
317 clear();
318 PeekaheadBuffer::ReturnCode rc;
319 while((rc = m_fileReader.readahead(sizeof(BGZFHeader)))==ok ) {
320 BGZFHeader *header;
321 if(rc==endOfFile) return rc;
322 // a rc==reSync is ok provided readahead still ensures that header is present
323 void *src = &(*(m_fileReader.begin())) + m_fileReader.startPosition();
324 header = (BGZFHeader *) src;
325 if(header->sane()) {
326 if(debug) std::cerr << "BGZFReader::sync returning reSync\n";
327 return reSync; // tell caller they need to sync up
328 }
329 // consume a byte, then see if we're at a valid block header
330 uint8_t throwAwayBuffer;
331 rc = m_fileReader.read(&throwAwayBuffer, 1);
332 }
333 return rc;
334 }
335 FILE *stream() {return m_fileReader.stream();}
336
337 bool eof() {return dataRemaining()==0 && m_fileReader.eof();}
338
339};
340
341PeekaheadBuffer::ReturnCode BGZFReader::readahead(ssize_t count)
342{
343 BGZFHeader header;
344 // size of inflateBuffer can be determined from ISIZE, I think
345 uint8_t inflateBuffer[64*1024];
346 uint8_t gzipBuffer[64*1024+1];
347
348 while(dataRemaining() < count) {
349 static int loopCount = 0;
350
351 if(debug) std::cerr << "BGZFReader::readahead loopcount = " << loopCount++ << "\n";
352
353 // here we actually read data:
354 // read what should be the header
355 // verify the header
356 // read the remainder of the block
357 // check the CRC validity or perhaps just call unzip
358 //
359 // XXX the sizeof(header) is wrong:
360 PeekaheadBuffer::ReturnCode rc = m_fileReader.read((uint8_t *) (&header), sizeof(header));
361
362 if(rc == endOfFile) {
363 return endOfFile;
364 }
365
366 // if we have a bad header, start looking forward for a good one,
367 if(!header.sane()) {
368 // sync does not consume the next good header, it simply syncs()
369 // the data stream to the next believed good BGZF header:
370 if(debug) std::cerr << "BGZFReader::readahead found corrupt BGZF header - now calling sync()\n";
371 rc = sync();
372 //
373 // even though we can now decompress, we need to tell the caller
374 // what is up before they call for more data (caller needs to
375 // sync its own record stream):
376 return rc;
377 }
378
379 // Read the remainder of the block.
380 // BSIZE is size of the entire block - 1, so compensate.
381 rc = m_fileReader.read((uint8_t *) &gzipBuffer, header.BSIZE() + 1 - sizeof(header));
382
383 if(rc == reSync) {
384 if(debug) std::cerr << "BGZFReader::readahead got incomplete BGZF read - now calling sync()\n";
385 sync();
386 return reSync;
387 }
388
389 //
390 // we read a header, but our attempt to read more data ended early,
391 // so best to just return EOF
392 //
393 if(rc == endOfFile) {
394 return rc;
395 }
396
397 PeekaheadBuffer::ReturnCode bgzf_rc = ok;
398 // zs.opaque is set when zalloc is NULL
399 //
400 // NB: zlib inflateInit2() has valgrind errors
401 // in versions <1.2.4 - those can be ignored.
402 //
403 z_stream zs;
404 zs.zalloc = NULL;
405 zs.zfree = NULL;
406 zs.next_in = gzipBuffer;
407 zs.avail_in = header.BSIZE() - 16; // XXX need to check docs for inflate
408 zs.next_out = inflateBuffer;
409 zs.avail_out = sizeof(inflateBuffer);
410
411 // -15 --> raw inflate - don't look for gzip or zlib header
412 // This can be optimized - inflateInit2 does a malloc of
413 // approximately 10K (sizeof(inflate_state))
414 if(inflateInit2(&zs, -15) != Z_OK) {
415 bgzf_rc = reSync;
416 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
417 // XXX fatal?
418 }
419 if(bgzf_rc==ok && inflate(&zs, Z_FINISH) != Z_STREAM_END) {
420 bgzf_rc = reSync;
421 if(debug) std::cerr << "BGZFReader::readahead - inflate failed (bad data), calling sync()\n";
422 }
423
424 if(bgzf_rc == ok) {
425 if(inflateEnd(&zs) == Z_OK) {
426 // do something with zs.total_out
427 if(debug) std::cout << "hey, got data! zs.total_out == " << zs.total_out << "\n";
428
429 // append the newly decompressed data
430 insert(end(), &inflateBuffer[0], &inflateBuffer[0] + zs.total_out);
431 } else {
432 // seems exceptionall unlikely, but check this error case too
433 bgzf_rc = reSync;
434 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
435 // XXX fatal?
436 }
437 }
438
439 if(bgzf_rc != ok) {
440 inflateEnd(&zs);
441 sync();
442 return bgzf_rc;
443 }
444
445 // may need to get more data - loop back till all is complete
446 }
447
448 return ok;
449
450}
451
452
#if 0
//
// Disabled manual smoke test: feed a BGZF stream on stdin and ask the
// reader to make 64 decompressed bytes available.
//
// readahead() should decompress a BGZF block into the buffer, or report
// reSync if the BGZF data turns out to be damaged.
//
void testBGZFBuffer()
{
    BGZFReader reader(stdin);

    std::cout << "size = " << reader.dataRemaining() << "\n";

    const BGZFReader::ReturnCode status = reader.readahead(64);

    std::cout << "rc = " << status << " - expect ok (1)\n";
    std::cout << "size (expect 64) = " << reader.size() << "\n";
}


int main(int argc, const char **argv)
{
    testBGZFBuffer();
}
#endif
480
481
482
483int BgzfFileTypeRecovery::close()
484{
485 if(bgzfReader) delete bgzfReader;
486 bgzfReader = NULL;
487 return true;
488}
489
490
491BgzfFileTypeRecovery::BgzfFileTypeRecovery(const char * filename, const char * mode)
492{
493 if(tolower(mode[0])=='r') {
494 FILE *f = fopen(filename,"r");
495 bgzfReader = new BGZFReader(f);
496 } else {
497 // die for now
498 if(debug) std::cerr << "Unable to open " << filename << " in mode " << mode << ".\n";
499 close();
500 }
501}
502
503//
504// Why is this ever called?
505//
506bool BgzfFileTypeRecovery::operator == (void * rhs)
507{
508 throw std::logic_error("BgzfFileTypeRecovery::operator == is dangerous - do not use");
509 return false;
510}
511
512bool BgzfFileTypeRecovery::operator != (void * rhs)
513{
514 throw std::logic_error("BgzfFileTypeRecovery::operator != is dangerous - do not use");
515 return false;
516}
517
518int BgzfFileTypeRecovery::eof()
519{
520 return bgzfReader->eof();
521}
522
523unsigned int BgzfFileTypeRecovery::write(const void * buffer, unsigned int size)
524{
525 // currently unsupported
526 return 0;
527}
528
529int BgzfFileTypeRecovery::read(void * buffer, unsigned int size)
530{
531
532 if(bgzfReader == NULL) {
533 return 0;
534 }
535
536 PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) buffer, size);
537 // endOfFile = -1,
538// reSync = 0,
539// ok = 1
540 switch(rc) {
541 case PeekaheadBuffer::endOfFile:
542 // set a flag?
543 return 0;
544 case PeekaheadBuffer::reSync:
545 // we could encode more info in the exception message here:
546 if(debug) std::cerr << "throwing BGZF sync exception\n";
547 throw std::runtime_error("BGZF stream resync");
548 case PeekaheadBuffer::ok:
549 //
550 // in bgzfReader, we always are ensured we
551 // get the full amount of the read, otherwise
552 // an error is thrown.
553 //
554 return size;
555 }
556 // NOTREACHED
557 return 0;
558}
559
560int64_t BgzfFileTypeRecovery::tell()
561{
562 // currently unsupported
563 return 0;
564}
565
566bool BgzfFileTypeRecovery::seek(int64_t offset, int origin)
567{
568 // currently unsupported
569 return 0;
570}
571
572
573bool BgzfFileTypeRecovery::attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
574{
575 //
576 // creep along a byte at a time, checking for signature.
577 //
578 // possibly slow. should only need to scan ahead < 64K bytes
579 // or so, however, so should recover in "reasonable" time.
580 //
581 while( bgzfReader->readahead(length) == PeekaheadBuffer::ok) {
582 char ch;
583 void *src = &(*(bgzfReader->begin())) + bgzfReader->startPosition();
584
585 //
586 // readahead ensures we have 'length' bytes of
587 // data to check that is valid in the buffer.
588 //
589 if((*checkSignature)(src)) return true;
590 PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) &ch, 1);
591 if(rc!=PeekaheadBuffer::ok) return false;
592 // we consumed a byte, so go back to top of loop,
593 // resume filling buffer (if need be) and re-check
594 }
595
596
597 return false;
598}
599
600#endif