1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | |
18 | #include <sstream> |
19 | #include "hddm_s.hpp" |
20 | |
21 | #ifndef _FILE_OFFSET_BITS64 |
22 | # define _FILE_OFFSET_BITS64 64 |
23 | #endif |
24 | |
25 | using namespace hddm_s; |
26 | |
/**
 * Compare two XML tag strings, tolerating a difference in how they close.
 *
 * Identical strings always match.  Otherwise the common leading part of
 * the two strings is skipped, then any run of blanks followed by any run
 * of '/' characters is skipped independently in each string, and what
 * remains of the two strings must be equal.  In effect "<a>", "<a/>" and
 * "<a />" all match one another, while tags whose attributes differ do not.
 *
 * All scans are bounded by the string lengths, so strings that contain
 * embedded NUL characters cannot cause a read past the end of either one.
 *
 * @param a  first tag string
 * @param b  second tag string
 * @return   nonzero if the tags match, zero otherwise
 */
static int tags_match(const std::string &a, const std::string &b)
{
   if (a == b) {
      return true;
   }
   const std::string::size_type na = a.size();
   const std::string::size_type nb = b.size();
   std::string::size_type ia = 0;
   std::string::size_type ib = 0;

   // step over the part that the two tags have in common
   while (ia < na && ib < nb && a[ia] == b[ib]) {
      ++ia;
      ++ib;
   }
   // ignore optional blanks and slashes at the point of divergence
   while (ia < na && a[ia] == ' ') {
      ++ia;
   }
   while (ia < na && a[ia] == '/') {
      ++ia;
   }
   while (ib < nb && b[ib] == ' ') {
      ++ib;
   }
   while (ib < nb && b[ib] == '/') {
      ++ib;
   }
   // whatever is left over must agree exactly
   return a.compare(ia, std::string::npos, b, ib, std::string::npos) == 0;
}
44 | |
45 | streamposition::streamposition() |
46 | : block_start(), block_offset(), block_status() {} |
47 | |
48 | streamposition::streamposition(uint64_t start, uint32_t offset, uint32_t status) |
49 | : block_start(start), block_offset(offset), block_status(status) {} |
50 | |
51 | istream::istream(std::istream &src) |
52 | : m_xstr(0), |
53 | m_istr(src), |
54 | m_xcmp(0), |
55 | m_xraw(0), |
56 | m_status_bits(0) |
57 | { |
58 | char hdr[10]; |
59 | src.getline(hdr,7); |
60 | m_documentString = hdr; |
61 | if (m_documentString != "<HDDM ") { |
62 | throw std::runtime_error("hddm_s::istream::istream error - invalid hddm header"); |
63 | } |
64 | src.clear(); |
65 | std::string line; |
66 | while (std::getline(src,line).good()) { |
67 | m_documentString += line + "\n"; |
68 | if (line == "</HDDM>") { |
69 | break; |
70 | } |
71 | } |
72 | if (src.bad()) { |
73 | throw std::runtime_error("hddm_s::istream::istream error - invalid hddm header"); |
74 | } |
75 | m_genome.m_tagname = "HDDM"; |
76 | m_genome.m_sequence = synthesize(m_documentString,0,HDDM::DocumentString(),0); |
77 | m_event_buffer = new char[m_event_buffer_size = 100000]; |
78 | m_sbuf = new istreambuffer(m_event_buffer,m_event_buffer_size); |
79 | configure_streambufs(); |
80 | m_next_event_size = 0; |
81 | m_events_to_skip = 0; |
82 | m_records_read = 0; |
83 | m_bytes_read = 0; |
84 | } |
85 | |
86 | istream::~istream() { |
87 | if (m_xraw) { |
88 | m_istr.rdbuf(m_xraw); |
89 | } |
90 | if (m_xcmp) { |
91 | delete m_xcmp; |
92 | } |
93 | if (m_xstr) { |
94 | delete m_xstr; |
95 | } |
96 | if (m_sbuf) { |
97 | delete m_sbuf; |
98 | } |
99 | delete [] m_event_buffer; |
100 | } |
101 | |
102 | streamposition istream::getPosition() const { |
103 | streamposition pos; |
104 | pos.block_status = m_status_bits; |
105 | if (m_status_bits & (k_bz2_compression | k_z_compression)) { |
106 | if (m_status_bits & k_can_reposition) { |
107 | pos.block_start = m_xraw->pubseekoff(0,std::ios_base::cur, |
108 | std::ios_base::out); |
109 | if (m_status_bits & k_bz2_compression) { |
110 | pos.block_offset = dynamic_cast<xstream::bz::istreambuf*>(m_xcmp) |
111 | ->get_block_offset(); |
112 | pos.block_start -= dynamic_cast<xstream::bz::istreambuf*>(m_xcmp) |
113 | ->get_block_buffered(); |
114 | } |
115 | else { |
116 | pos.block_offset = dynamic_cast<xstream::z::istreambuf*>(m_xcmp) |
117 | ->get_block_offset(); |
118 | pos.block_start -= dynamic_cast<xstream::z::istreambuf*>(m_xcmp) |
119 | ->get_block_buffered(); |
120 | } |
121 | if (m_next_event_size > 0) |
122 | pos.block_offset -= std::streamoff(4); |
123 | } |
124 | else { |
125 | throw std::runtime_error("hddm_s::istream::getPosition error - " |
126 | "old-format hddm input file does not support repositioning."); |
127 | } |
128 | } |
129 | else { |
130 | pos.block_start = m_istr.tellg(); |
131 | if (m_next_event_size > 0) |
132 | pos.block_start -= std::streamoff(4); |
133 | pos.block_offset = 0; |
134 | } |
135 | return pos; |
136 | } |
137 | |
138 | void istream::setPosition(const streamposition &pos) { |
139 | m_status_bits = pos.block_status; |
140 | if (m_status_bits & (k_bz2_compression | k_z_compression)) { |
| |
141 | if ((m_status_bits & k_can_reposition) == 0) { |
| |
142 | throw std::runtime_error("hddm_s::istream::setPosition error - " |
143 | "old-format hddm input file does not support repositioning."); |
144 | } |
145 | if (m_xraw == 0 || pos.block_start != getPosition().block_start || |
146 | pos.block_offset < getPosition().block_offset) |
147 | { |
148 | m_bytes_read = 0; |
149 | configure_streambufs(); |
| 3 | | Calling 'istream::configure_streambufs' | |
|
| 7 | | Returning from 'istream::configure_streambufs' | |
|
150 | m_xraw->pubseekoff(pos.block_start,std::ios_base::beg, |
| 8 | | Called C++ object pointer is null |
|
151 | std::ios_base::in); |
152 | m_next_event_size = 0; |
153 | } |
154 | int advance; |
155 | while ((advance = pos.block_offset - getPosition().block_offset)) { |
156 | char tmpbuf[advance]; |
157 | m_xcmp->sgetn(tmpbuf, advance); |
158 | } |
159 | } |
160 | else if (pos.block_start != getPosition().block_start) { |
161 | m_istr.seekg(pos.block_start); |
162 | m_next_event_size = 0; |
163 | } |
164 | } |
165 | |
166 | void istream::configure_streambufs() { |
167 | if (m_xstr == 0) { |
| |
168 | m_xstr = new xstream::xdr::istream(m_sbuf); |
169 | } |
170 | if (m_xraw == 0 && (m_status_bits & k_z_compression) != 0) { |
171 | |
172 | m_xraw = m_istr.rdbuf(); |
173 | m_xcmp = new xstream::z::istreambuf(m_xraw); |
174 | m_istr.rdbuf(m_xcmp); |
175 | } |
176 | else if (m_xraw == 0 && (m_status_bits & k_bz2_compression) != 0) { |
177 | |
178 | m_xraw = m_istr.rdbuf(); |
179 | m_xcmp = new xstream::bz::istreambuf(m_xraw); |
180 | m_istr.rdbuf(m_xcmp); |
181 | } |
182 | else if (m_xraw == 0 && (m_status_bits & k_bits_compression) != 0) { |
183 | throw std::runtime_error("hddm_s::istream::configure_streambufs error - " |
184 | "unrecognized compression flag requested."); |
185 | } |
186 | else if (m_xraw != 0 && m_bytes_read == 0) { |
| |
187 | m_istr.rdbuf(m_xraw); |
188 | delete m_xcmp; |
189 | m_xcmp = 0; |
190 | m_xraw = 0; |
| 6 | | Null pointer value stored to field 'm_xraw' | |
|
191 | configure_streambufs(); |
192 | } |
193 | else if (m_xraw != 0) { |
194 | throw std::runtime_error("hddm_s::istream::configure_streambufs error" |
195 | " - cannot reconfigure compression in mid-stream!"); |
196 | } |
197 | } |
198 | |
199 | istream &istream::operator>>(HDDM &record) { |
200 | if (m_next_event_size == 0) { |
201 | m_istr.read(m_event_buffer,4); |
202 | m_bytes_read += m_istr.gcount(); |
203 | if (!m_istr.good()) { |
204 | throw std::runtime_error("hddm_s::istream::operator>> error - " |
205 | "attempt to read past end of file!"); |
206 | } |
207 | m_sbuf->reset(); |
208 | *m_xstr >> m_next_event_size; |
209 | return *this >> record; |
210 | } |
211 | else if (m_next_event_size == 1) { |
212 | m_istr.read(m_event_buffer+4,4); |
213 | m_bytes_read += m_istr.gcount(); |
214 | if (!m_istr.good()) { |
215 | throw std::runtime_error("hddm_s::istream::operator>> error -" |
216 | " read error on token input!"); |
217 | } |
218 | int size; |
219 | *m_xstr >> size; |
220 | m_istr.read(m_event_buffer+8,size); |
221 | m_bytes_read += m_istr.gcount(); |
222 | if (!m_istr.good()) { |
223 | throw std::runtime_error("hddm_s::istream::operator>> error -" |
224 | " read error on token input!"); |
225 | } |
226 | int format, flags; |
227 | *m_xstr >> format >> flags; |
228 | if (format != 0) { |
229 | throw std::runtime_error("hddm_s::istream::operator>> error - " |
230 | "unsupported compression format!"); |
231 | } |
232 | else if (flags != m_status_bits) { |
233 | int oldcmp = m_status_bits & k_bits_compression; |
234 | int newcmp = flags & k_bits_compression; |
235 | m_status_bits = flags; |
236 | if (oldcmp != newcmp) { |
237 | configure_streambufs(); |
238 | } |
239 | } |
240 | m_next_event_size = 0; |
241 | return *this >> record; |
242 | } |
243 | else if (m_next_event_size+8 > m_event_buffer_size) { |
244 | delete m_xstr; |
245 | delete m_sbuf; |
246 | char *newbuf = new char[m_event_buffer_size = m_next_event_size+1000]; |
247 | m_sbuf = new istreambuffer(newbuf, m_event_buffer_size); |
248 | m_xstr = new xstream::xdr::istream(m_sbuf); |
249 | memcpy(newbuf,m_event_buffer,4); |
250 | delete [] m_event_buffer; |
251 | m_event_buffer = newbuf; |
252 | } |
253 | |
254 | m_istr.read(m_event_buffer+4,m_next_event_size); |
255 | m_bytes_read += m_istr.gcount(); |
256 | m_records_read++; |
257 | if (!m_istr.good()) { |
258 | throw std::runtime_error("hddm_s::istream::operator>> error -" |
259 | " read error in mid-record!"); |
260 | } |
261 | if ((m_status_bits & k_crc32_integrity) != 0) { |
262 | unsigned int recorded_crc; |
263 | char crcbuf[10]; |
264 | istreambuffer sbuf(crcbuf,10); |
265 | xstream::xdr::istream xstr(&sbuf); |
266 | m_istr.read(crcbuf,4); |
267 | m_bytes_read += m_istr.gcount(); |
268 | xstr >> recorded_crc; |
269 | xstream::digest::crc32 crc; |
270 | std::ostream out(&crc); |
271 | out.write(m_event_buffer,m_next_event_size+4); |
272 | out.flush(); |
273 | if (crc.digest() != recorded_crc) { |
274 | char errmsg[] = |
275 | "WARNING: crc data integrity check failed" |
276 | " on hddm_s input stream!" |
277 | "\nThis may be the result of a bug in the" |
278 | " xstream library if you are analyzing a data" |
279 | " file that was generated by code prior to svn" |
280 | " rev 18530.\nIf this concerns you, regenerate" |
281 | " using a newer build of the sim-recon tools" |
282 | " and it should go away.\n"; |
283 | if ((m_status_bits & 0x02) == 0) { |
284 | std::cerr << errmsg << std::endl; |
285 | m_status_bits |= 0x02; |
286 | } |
287 | |
288 | |
289 | } |
290 | } |
291 | |
292 | if (m_events_to_skip) { |
293 | --m_events_to_skip; |
294 | m_next_event_size = 0; |
295 | return *this >> record; |
296 | } |
297 | m_sbuf->reset(); |
298 | m_sequencing = 0; |
299 | m_codon = &m_genome; |
300 | *this >> (streamable&)record; |
301 | m_istr.read(m_event_buffer,4); |
302 | m_bytes_read += m_istr.gcount(); |
303 | if (m_istr.eof()) { |
304 | m_next_event_size = 0; |
305 | } |
306 | else if (!m_istr.good()) { |
307 | throw std::runtime_error("hddm_s::istream::operator>> error - " |
308 | "read error on event size!"); |
309 | } |
310 | else { |
311 | m_sbuf->reset(); |
312 | *m_xstr >> m_next_event_size; |
313 | } |
314 | return *this; |
315 | } |
316 | |
317 | ostream::ostream(std::ostream &src) |
318 | : m_xstr(0), |
319 | m_ostr(src), |
320 | m_xcmp(0), |
321 | m_xraw(0), |
322 | m_status_bits(k_default_status) |
323 | { |
324 | m_ostr << HDDM::DocumentString(); |
325 | if (!m_ostr.good()) { |
326 | throw std::runtime_error("hddm_s::ostream::ostream(ostream) " |
327 | "error - write error on header output!"); |
328 | } |
329 | m_event_buffer = new char[m_event_buffer_size = 100000]; |
330 | m_sbuf = new ostreambuffer(m_event_buffer,m_event_buffer_size); |
331 | configure_streambufs(); |
332 | m_records_written = 0; |
333 | m_bytes_written = 0; |
334 | } |
335 | |
336 | ostream::~ostream() { |
337 | if (m_xstr) { |
338 | delete m_xstr; |
339 | } |
340 | if (m_xraw) { |
341 | m_ostr.flush(); |
342 | m_ostr.rdbuf(m_xraw); |
343 | } |
344 | if (m_xcmp) { |
345 | delete m_xcmp; |
346 | } |
347 | if (m_sbuf) { |
348 | delete m_sbuf; |
349 | } |
350 | delete [] m_event_buffer; |
351 | } |
352 | |
353 | void ostream::setCompression(int flags) { |
354 | if ((flags ^ m_status_bits) & k_bits_compression) { |
355 | if ((m_status_bits & k_bits_compression) != k_no_compression) { |
356 | std::cerr << "hddm_s::ostream::setCompression warning - " |
357 | << "compression already enabled, cannot change." |
358 | << std::endl; |
359 | return; |
360 | } |
361 | m_status_bits &= ~k_bits_compression; |
362 | m_status_bits |= flags; |
363 | m_status_bits |= k_can_reposition; |
364 | m_sbuf->reset(); |
365 | *m_xstr << 1 << 8 << 0 << m_status_bits; |
366 | m_ostr.write(m_sbuf->getbuf(),m_sbuf->size()); |
367 | if (!m_ostr.good()) { |
368 | throw std::runtime_error("hddm_s::ostream::setCompression" |
369 | " error - write error on token output!"); |
370 | } |
371 | configure_streambufs(); |
372 | } |
373 | } |
374 | |
375 | void ostream::setIntegrityChecks(int flags) { |
376 | if ((flags ^ m_status_bits) & k_bits_integrity) { |
377 | m_status_bits &= ~k_bits_integrity; |
378 | m_status_bits |= flags; |
379 | m_sbuf->reset(); |
380 | *m_xstr << 1 << 8 << 0 << m_status_bits; |
381 | m_ostr.write(m_sbuf->getbuf(),m_sbuf->size()); |
382 | if (!m_ostr.good()) { |
383 | throw std::runtime_error("hddm_s::ostream::setIntegrityChecks error - " |
384 | "write error on token output!"); |
385 | } |
386 | } |
387 | } |
388 | |
389 | streamposition ostream::getPosition() const { |
390 | streamposition pos; |
391 | pos.block_status = m_status_bits; |
392 | if (m_status_bits & k_bz2_compression) { |
393 | pos.block_start = m_xraw->pubseekoff(0,std::ios_base::cur, |
394 | std::ios_base::out); |
395 | pos.block_offset = ((xstream::bz::istreambuf*)m_xcmp)->get_block_offset(); |
396 | } |
397 | else if (m_status_bits & k_z_compression) { |
398 | pos.block_start = m_xraw->pubseekoff(0,std::ios_base::cur, |
399 | std::ios_base::out); |
400 | pos.block_offset = ((xstream::z::istreambuf*)m_xcmp)->get_block_offset(); |
401 | } |
402 | else { |
403 | pos.block_start = m_ostr.tellp(); |
404 | pos.block_offset = 0; |
405 | } |
406 | return pos; |
407 | } |
408 | |
409 | void ostream::configure_streambufs() { |
410 | if (m_xstr == 0) { |
411 | m_xstr = new xstream::xdr::ostream(m_sbuf); |
412 | } |
413 | if (m_xraw == 0 && (m_status_bits & k_z_compression) != 0) { |
414 | |
415 | m_xraw = m_ostr.rdbuf(); |
416 | m_xcmp = new xstream::z::ostreambuf(m_xraw); |
417 | m_ostr.rdbuf(m_xcmp); |
418 | } |
419 | else if (m_xraw == 0 && (m_status_bits & k_bz2_compression) != 0) { |
420 | |
421 | m_xraw = m_ostr.rdbuf(); |
422 | m_xcmp = new xstream::bz::ostreambuf(m_xraw); |
423 | m_ostr.rdbuf(m_xcmp); |
424 | } |
425 | else if (m_xraw == 0 && (m_status_bits & k_bits_compression) != 0) { |
426 | throw std::runtime_error("hddm_s::ostream::configure_streambufs error - " |
427 | "unrecognized compression flag requested."); |
428 | } |
429 | else if (m_xraw != 0 && m_bytes_written == 0) { |
430 | m_ostr.rdbuf(m_xraw); |
431 | delete m_xcmp; |
432 | m_xcmp = 0; |
433 | m_xraw = 0; |
434 | configure_streambufs(); |
435 | } |
436 | else if (m_xraw != 0) { |
437 | throw std::runtime_error("hddm_s::ostream::configure_streambufs error" |
438 | " - cannot reconfigure compression in mid-stream!"); |
439 | } |
440 | } |
441 | |
442 | int istream::getTag(const std::string &src, int start, |
443 | std::string &tag, int &level) |
444 | { |
445 | tag = ""; |
446 | size_t p_btag = src.find("<",start); |
447 | size_t p_bline = src.find_last_of("\n",p_btag); |
448 | if (p_bline == std::string::npos) |
449 | { |
450 | p_bline = 0; |
451 | } |
452 | else |
453 | { |
454 | ++p_bline; |
455 | } |
456 | level = (p_btag-p_bline)/2; |
457 | size_t p_etag = p_btag; |
458 | for (size_t quotes=0; p_etag < src.size(); ++p_etag) { |
459 | if (src[p_etag] == '"') { |
460 | tag += "\""; |
461 | ++quotes; |
462 | } |
463 | else if (quotes/2*2 != quotes) { |
464 | tag += src[p_etag]; |
465 | } |
466 | else if (src.find_first_of(" \t\n",p_etag) == 0) { |
467 | tag += " "; |
468 | p_etag = src.find_first_not_of(" \t\n",p_etag)-1; |
469 | } |
470 | else if (src[p_etag] == '>') { |
471 | tag += ">"; |
472 | break; |
473 | } |
474 | else { |
475 | tag += src[p_etag]; |
476 | } |
477 | } |
478 | if (p_etag == src.size()) { |
479 | std::stringstream sstr; |
480 | sstr << "hddm_s::istream::getTag" |
481 | << " error - bad header format" << std::endl |
482 | << " tag " << tag << " at position " << start |
483 | << std::endl; |
484 | throw std::runtime_error(sstr.str()); |
485 | } |
486 | return p_etag+2; |
487 | } |
488 | |
489 | int istream::getEndTag(const std::string &src, int start, |
490 | const std::string &tag) |
491 | { |
492 | if (tag.rfind("/>") == tag.size()-2) { |
493 | return src.find(tag,start) + tag.size()+1; |
494 | } |
495 | else { |
496 | std::string etag = "</"; |
497 | etag += tag.substr(1,tag.find_first_of(' ')-1) + ">"; |
498 | size_t p_etag = src.find(etag,start); |
499 | size_t p_quote = src.find_first_of('"',start); |
500 | while (p_quote != std::string::npos && p_quote < p_etag) { |
501 | p_quote = src.find_first_of('"',p_quote+1); |
502 | if (p_quote > p_etag) { |
503 | p_etag = src.find(etag,p_quote+1); |
504 | } |
505 | p_quote = src.find_first_of('"',p_quote+1); |
506 | } |
507 | if (p_etag == std::string::npos) { |
508 | std::stringstream sstr; |
509 | sstr << "hddm_s::istream::getEndTag" |
510 | << " error - bad header format" << std::endl |
511 | << " tag " << tag << " at position " << start |
512 | << std::endl |
513 | << " end tag " << etag << " not found." |
514 | << std::endl; |
515 | throw std::runtime_error(sstr.str()); |
516 | } |
517 | return p_etag + etag.size()+1; |
518 | } |
519 | } |
520 | |
521 | void istream::collide(const std::string &itag, const std::string &rtag) { |
522 | std::string itagname = itag.substr(1,itag.find(" ")-1); |
523 | std::string rtagname = rtag.substr(1,rtag.find(" ")-1); |
524 | std::string errmsg = "hddm_s::istream::collide warning:\n" |
525 | "tag " + itagname + " in input file " |
526 | "does not match c++ header hddm_s.hpp\n" |
527 | " input file: " + itag + "\n" |
528 | " c++ header: " + rtag + "\n" |
529 | " === Tag " + itagname + " will be ignored," |
530 | " rebuild to cure the problem ==="; |
531 | if (itagname != "HDDM") { |
532 | std::cerr << errmsg << std::endl; |
533 | } |
534 | else { |
535 | throw std::runtime_error(errmsg); |
536 | } |
537 | } |
538 | |
539 | chromosome istream::synthesize(const std::string &src, int p_src, |
540 | const std::string &ref, int p_ref) |
541 | { |
542 | chromosome chrom; |
543 | int slevel, rlevel; |
544 | std::string stag, rtag; |
545 | p_src = getTag(src,p_src,stag,slevel); |
546 | p_ref = getTag(ref,p_ref,rtag,rlevel); |
547 | std::string stagname = stag.substr(1,stag.find(" ")-1); |
548 | std::string rtagname = rtag.substr(1,rtag.find(" ")-1); |
549 | if (stagname != rtagname) { |
550 | throw std::runtime_error("hddm_s::istream::synthesize error - matching algorithm error #2"); |
551 | } |
552 | else if (!tags_match(stag,rtag)) { |
553 | collide(stag,rtag); |
554 | return chrom; |
555 | } |
556 | |
557 | int p2_src, p2_ref; |
558 | int s2level, r2level; |
559 | std::string s2tag, r2tag; |
560 | getTag(src,p2_src=p_src,s2tag,s2level); |
561 | while (s2level > slevel) { |
562 | codon *gene = new codon(); |
563 | std::string s2tagname = s2tag.substr(1,s2tag.find(" ")-1); |
564 | getTag(ref,p2_ref=p_ref,r2tag,r2level); |
565 | int order_of_this_tag_in_ref = 1; |
566 | while (r2level == s2level) { |
567 | std::string r2tagname = r2tag.substr(1,r2tag.find(" ")-1); |
568 | if (s2tagname == r2tagname) { |
569 | if (!tags_match(s2tag,r2tag)) { |
570 | collide(s2tag,r2tag); |
571 | break; |
572 | } |
573 | else { |
574 | gene->m_order = order_of_this_tag_in_ref; |
575 | } |
576 | gene->m_sequence = synthesize(src,p2_src,ref,p2_ref); |
577 | break; |
578 | } |
579 | p2_ref = getEndTag(ref,p2_ref,r2tag); |
580 | getTag(ref,p2_ref,r2tag,r2level); |
581 | ++order_of_this_tag_in_ref; |
582 | } |
583 | gene->m_tagname = s2tagname; |
584 | chrom.push_back(*gene); |
585 | delete gene; |
586 | p2_src = getEndTag(src,p2_src,s2tag); |
587 | getTag(src,p2_src,s2tag,s2level); |
588 | } |
589 | return chrom; |
590 | } |