|
MASA-Core
|
00001 /******************************************************************************* 00002 * 00003 * Copyright (c) 2010-2015 Edans Sandes 00004 * 00005 * This file is part of MASA-Core. 00006 * 00007 * MASA-Core is free software: you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation, either version 3 of the License, or 00010 * (at your option) any later version. 00011 * 00012 * MASA-Core is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU General Public License 00018 * along with MASA-Core. If not, see <http://www.gnu.org/licenses/>. 00019 * 00020 ******************************************************************************/ 00021 00022 #include "AlignmentBinaryFile.hpp" 00023 00024 #include <stdio.h> 00025 #include <vector> 00026 using namespace std; 00027 #include <arpa/inet.h> 00028 #include <stdlib.h> 00029 #include <string.h> 00030 00031 #include "Constants.hpp" 00032 00033 #include "../../common/BlocksFile.hpp" 00034 00035 #define MAGIC_HEADER "CGFF" 00036 #define MAGIC_HEADER_LEN 4 //in bytes 00037 #define FILE_VERSION_MAJOR 0 00038 #define FILE_VERSION_MINOR 1 00039 00040 00041 #define END_OF_FIELDS (0) 00042 00043 #define FIELD_ALIGNMENT_METHOD (1) 00044 #define FIELD_SCORING_SYSTEM (2) 00045 #define FIELD_PENALTY_SYSTEM (3) 00046 #define FIELD_SEQUENCE_PARAMS (4) 00047 00048 00049 #define FIELD_SEQUENCE_DESCRIPTION (1) 00050 #define FIELD_SEQUENCE_TYPE (2) 00051 #define FIELD_SEQUENCE_SIZE (3) 00052 #define FIELD_SEQUENCE_HASH (4) 00053 #define FIELD_SEQUENCE_DATA_PLAIN (5) 00054 #define FIELD_SEQUENCE_DATA_COMPRESSED (6) 00055 00056 #define FIELD_RESULT_RAW_SCORE (1) 00057 #define FIELD_RESULT_BIT_SCORE (2) 00058 #define FIELD_RESULT_E_VALUE (3) 00059 #define FIELD_RESULT_SCORE_STATISTICS (4) 00060 #define FIELD_RESULT_GAP_LIST (5) 00061 #define FIELD_RESULT_BLOCKS (6) 00062 #define FIELD_RESULT_CELLS (7) 00063 00064 00065 #define MAX_SEQUENCES (2) 00066 00067 00068 void AlignmentBinaryFile::write(string filename, Alignment* alignment) { 00069 FILE* file = fopen(filename.c_str(), "wb"); 00070 00071 fwrite_header(file); 00072 00073 AlignmentParams* params = alignment->getAlignmentParams(); 00074 vector<Sequence*> sequences = params->getSequences(); 00075 vector<SequenceInfo*> info; 00076 info.clear(); 00077 for (int i=0; i<sequences.size(); i++) { 00078 sequences[i]->getInfo()->setIndex(i); 00079 info.push_back(sequences[i]->getInfo()); 00080 } 00081 00082 00083 fwrite_sequences(info, file); 00084 fwrite_alignment_params(params, file); 00085 fwrite_alignment_result(alignment, file); 00086 00087 fclose(file); 00088 } 00089 00090 Alignment* AlignmentBinaryFile::read(string filename) { 00091 FILE* file = fopen(filename.c_str(), "rb"); 00092 00093 fread_header(file); 00094 00095 vector<SequenceInfo*> sequences = fread_sequences(file); 00096 AlignmentParams* params = fread_alignment_params(sequences, file); 00097 Alignment* alignment = fread_alignment_result(params, file); 00098 00099 fclose(file); 00100 return alignment; 00101 } 00102 00103 void AlignmentBinaryFile::fwrite_header(FILE* file) { 00104 fwrite_array(MAGIC_HEADER, MAGIC_HEADER_LEN, file); 00105 fwrite_int1(FILE_VERSION_MAJOR, file); 00106 fwrite_int1(FILE_VERSION_MINOR, file); 00107 } 00108 00109 void AlignmentBinaryFile::fread_header(FILE* file) { 00110 char header[MAGIC_HEADER_LEN]; 00111 fread_array(header, MAGIC_HEADER_LEN, file); 00112 if (memcmp(header, MAGIC_HEADER, MAGIC_HEADER_LEN) != 0) { 00113 fprintf(stderr, "Wrong File Format. Header Error.\n"); 00114 // TODO ERROR 00115 return; 00116 } 00117 int file_version_major = fread_int1(file); 00118 int file_version_minor = fread_int1(file); 00119 if (file_version_major > FILE_VERSION_MAJOR) { 00120 fprintf(stderr, "File Version not supported (%d.%d > %d.%d).\n", 00121 file_version_major, file_version_minor, 00122 FILE_VERSION_MAJOR, FILE_VERSION_MINOR); 00123 // TODO ERROR 00124 return; 00125 } 00126 } 00127 00128 void AlignmentBinaryFile::fwrite_sequences(vector<SequenceInfo*> sequences, FILE* file) { 00129 int count = sequences.size(); 00130 fwrite_int4(count, file); 00131 for (int i=0; i<count; i++) { 00132 SequenceInfo* seq = sequences[i]; 00133 seq->setIndex(i); 00134 00135 fwrite_int1(FIELD_SEQUENCE_DESCRIPTION, file); 00136 fwrite_str(seq->getDescription().c_str(), file); 00137 00138 fwrite_int1(FIELD_SEQUENCE_TYPE, file); 00139 fwrite_int1(seq->getType(), file); 00140 00141 fwrite_int1(FIELD_SEQUENCE_SIZE, file); 00142 fwrite_int4(seq->getSize(), file); 00143 00144 fwrite_int1(END_OF_FIELDS, file); 00145 } 00146 } 00147 00148 vector<SequenceInfo*> AlignmentBinaryFile::fread_sequences(FILE* file) { 00149 vector<SequenceInfo*> sequences; 00150 int count = fread_int4(file); 00151 if (count > MAX_SEQUENCES) { 00152 fprintf(stderr, "Sanity Check: too many sequences (%d > %d).\n", count, MAX_SEQUENCES); 00153 exit(0); 00154 } 00155 for (int i=0; i<count; i++) { 00156 SequenceInfo* info = new SequenceInfo(); 00157 sequences.push_back(info); 00158 int field; 00159 while ((field = fread_int1(file)) != END_OF_FIELDS) { 00160 00161 int field_len; 00162 string str = ""; 00163 switch (field) { 00164 case FIELD_SEQUENCE_DESCRIPTION: 00165 info->setDescription(fread_str(file)); 00166 break; 00167 case FIELD_SEQUENCE_TYPE: 00168 info->setType(fread_int1(file)); 00169 break; 00170 case FIELD_SEQUENCE_SIZE: 00171 info->setSize(fread_int4(file)); 00172 break; 00173 case FIELD_SEQUENCE_HASH: 00174 info->setHash(fread_str(file)); 00175 break; 00176 case FIELD_SEQUENCE_DATA_PLAIN: 00177 field_len = fread_int4(file); 00178 fread_dummy(field_len, file); 00179 break; 00180 case FIELD_SEQUENCE_DATA_COMPRESSED: 00181 field_len = fread_int4(file); 00182 fread_dummy(field_len, file); 00183 break; 00184 default: 00185 fprintf(stderr, "Sanity Check: Unknown Field (%d).\n", field); 00186 exit(0); 00187 break; 00188 00189 } 00190 } 00191 } 00192 return sequences; 00193 } 00194 00195 00196 void AlignmentBinaryFile::fwrite_alignment_params(AlignmentParams* params, FILE* file) { 00197 fwrite_int1(FIELD_ALIGNMENT_METHOD, file); 00198 fwrite_int1(params->getAlignmentMethod(), file); 00199 00200 fwrite_int1(FIELD_SCORING_SYSTEM, file); 00201 fwrite_int1(params->getScoreSystem(), file); 00202 switch (params->getScoreSystem()) { 00203 case SCORE_MATCH_MISMATCH: 00204 fwrite_int4(params->getMatch(), file); 00205 fwrite_int4(params->getMismatch(), file); 00206 break; 00207 default: 00208 fprintf(stderr, "Sanity Check: Unknown Score System (%d).\n", params->getPenaltySystem()); 00209 exit(0); 00210 break; 00211 } 00212 00213 fwrite_int1(FIELD_PENALTY_SYSTEM, file); 00214 fwrite_int1(params->getPenaltySystem(), file); 00215 switch (params->getPenaltySystem()) { 00216 case PENALTY_LINEAR_GAP: 00217 fwrite_int4(params->getGapExtension(), file); 00218 break; 00219 case PENALTY_AFFINE_GAP: 00220 fwrite_int4(params->getGapOpen(), file); 00221 fwrite_int4(params->getGapExtension(), file); 00222 break; 00223 default: 00224 fprintf(stderr, "Sanity Check: Unknown Penalty System (%d).\n", params->getPenaltySystem()); 00225 exit(0); 00226 break; 00227 } 00228 00229 fwrite_int1(FIELD_SEQUENCE_PARAMS, file); 00230 int count = params->getSequencesCount(); 00231 fwrite_int4(count, file); 00232 for (int i=0; i<count; i++) { 00233 SequenceInfo* sequenceInfo = params->getSequence(i)->getInfo(); 00234 fwrite_int4(sequenceInfo->getIndex(), file); 00235 SequenceModifiers* modifiers = params->getSequence(i)->getModifiers(); 00236 fwrite_flags(modifiers, file); 00237 } 00238 00239 fwrite_int1(END_OF_FIELDS, file); 00240 } 00241 00242 AlignmentParams* AlignmentBinaryFile::fread_alignment_params(vector<SequenceInfo*> sequences, FILE* file) { 00243 AlignmentParams* params = new AlignmentParams(); 00244 int field; 00245 while ((field = fread_int1(file)) != END_OF_FIELDS) { 00246 switch (field) { 00247 case FIELD_ALIGNMENT_METHOD: 00248 params->setAlignmentMethod(fread_int1(file)); 00249 break; 00250 case FIELD_SCORING_SYSTEM: 00251 params->setScoreSystem(fread_int1(file)); 00252 switch (params->getScoreSystem()) { 00253 case SCORE_MATCH_MISMATCH: { 00254 int match = fread_int4(file); 00255 int mismatch = fread_int4(file); 00256 params->setMatchMismatchScores(match, mismatch); 00257 break; 00258 } 00259 case SCORE_SIMILARITY_MATRIX: { 00260 fprintf(stderr, "Score Matrix not supported yet.\n"); 00261 exit(0); 00262 break; 00263 } 00264 default: 00265 fprintf(stderr, "Unknown Score System.\n"); 00266 exit(1); 00267 break; 00268 } 00269 break; 00270 case FIELD_PENALTY_SYSTEM: 00271 params->setPenaltySystem(fread_int1(file)); 00272 switch (params->getPenaltySystem()) { 00273 case PENALTY_LINEAR_GAP: { 00274 int gapOpen = 0; 00275 int gapExtension = fread_int4(file); 00276 params->setAffineGapPenalties(gapOpen, gapExtension); 00277 break; 00278 } 00279 case PENALTY_AFFINE_GAP: { 00280 int gapOpen = fread_int4(file); 00281 int gapExtension = fread_int4(file); 00282 params->setAffineGapPenalties(gapOpen, gapExtension); 00283 break; 00284 } 00285 default: 00286 fprintf(stderr, "Unknown Penalty System.\n"); 00287 exit(1); 00288 break; 00289 } 00290 break; 00291 case FIELD_SEQUENCE_PARAMS: { 00292 int count = fread_int4(file); 00293 for (int i=0; i<count; i++) { 00294 int index = fread_int4(file); 00295 00296 SequenceModifiers* modifiers = fread_flags(file); 00297 00298 params->addSequence(new Sequence(sequences[index], modifiers)); 00299 } 00300 break; 00301 } 00302 default: 00303 fprintf(stderr, "Sanity Check: Unknown Field (%d).\n", field); 00304 exit(0); 00305 break; 00306 00307 } 00308 } 00309 return params; 00310 } 00311 00312 00313 void AlignmentBinaryFile::fwrite_alignment_result(Alignment* alignment, FILE* file) { 00314 fwrite_int4(1, file); // Only 1 result is supported by now 00315 int count = alignment->getAlignmentParams()->getSequencesCount(); 00316 00317 fwrite_int1(FIELD_RESULT_RAW_SCORE, file); 00318 fwrite_int4(alignment->getRawScore(), file); 00319 00320 fwrite_int1(FIELD_RESULT_SCORE_STATISTICS, file); 00321 fwrite_int4(alignment->getMatches(), file); 00322 fwrite_int4(alignment->getMismatches(), file); 00323 fwrite_int4(alignment->getGapOpen(), file); 00324 fwrite_int4(alignment->getGapExtensions(), file); 00325 00326 fwrite_int1(FIELD_RESULT_GAP_LIST, file); 00327 for (int i=0; i<count; i++) { 00328 fwrite_int4(alignment->getStart(i), file); 00329 fwrite_int4(alignment->getEnd(i), file); 00330 fwrite_gaps(alignment->getGaps(i), file); 00331 } 00332 00333 if (!alignment->getPruningFile().empty()) { 00334 //FILE* pruningFile = fopen(alignment->getPruningFile().c_str(), "rb"); 00335 fwrite_int1(FIELD_RESULT_BLOCKS, file); 00336 BlocksFile* blocksFile = new BlocksFile(alignment->getPruningFile()); 00337 int bw = 512; 00338 int bh = 512; 00339 int* buffer = blocksFile->reduceData(bh, bw); 00340 fwrite_int4(bh, file); 00341 fwrite_int4(bw, file); 00342 for (int i=0; i<bh*bw; i++) { 00343 fwrite_int4(buffer[i], file); 00344 } 00345 delete blocksFile; 00346 // int i; 00347 // while (fread(&i, sizeof(int), 1, pruningFile) > 0) { 00348 // fwrite_int4(i, file); 00349 // } 00350 // fclose(pruningFile); 00351 } 00352 00353 fwrite_int1(END_OF_FIELDS, file); 00354 } 00355 00356 Alignment* AlignmentBinaryFile::fread_alignment_result(AlignmentParams* params, FILE* file) { 00357 int results = fread_int4(file); 00358 if (results > 1) { 00359 // Only 1 result is supported by now 00360 fprintf(stderr, "Too many results %d.\n", results); 00361 exit(1); 00362 } 00363 00364 Alignment* alignment = new Alignment(params); 00365 int count = params->getSequencesCount(); 00366 int field; 00367 while ((field = fread_int1(file)) != END_OF_FIELDS) { 00368 switch (field) { 00369 case FIELD_RESULT_RAW_SCORE: 00370 alignment->setRawScore(fread_int4(file)); 00371 break; 00372 case FIELD_RESULT_SCORE_STATISTICS: 00373 alignment->setMatches(fread_int4(file)); 00374 alignment->setMismatches(fread_int4(file)); 00375 alignment->setGapOpen(fread_int4(file)); 00376 alignment->setGapExtensions(fread_int4(file)); 00377 break; 00378 case FIELD_RESULT_GAP_LIST:{ 00379 for (int i=0; i<count; i++) { 00380 alignment->setStart(i, fread_int4(file)); 00381 alignment->setEnd(i, fread_int4(file)); 00382 fread_gaps(alignment->getGaps(i), file); 00383 } 00384 break; 00385 } 00386 case FIELD_RESULT_BLOCKS:{ 00387 int h = fread_int4(file); 00388 int w = fread_int4(file); 00389 // NOT IMPLEMENTED FOR READING 00390 fread_dummy(h*w*sizeof(int), file); 00391 break; 00392 } 00393 default: 00394 fprintf(stderr, "Sanity Check: Unknown Field (%d).\n", field); 00395 exit(0); 00396 break; 00397 00398 } 00399 } 00400 return alignment; 00401 } 00402 00403 void AlignmentBinaryFile::fwrite_flags(SequenceModifiers* modifiers, FILE* file) { 00404 int flags = 0; 00405 if (modifiers->isClearN()) { 00406 flags |= FLAG_CLEAR_N; 00407 } 00408 if (modifiers->isComplement()) { 00409 flags |= FLAG_COMPLEMENT; 00410 } 00411 if (modifiers->isReverse()) { 00412 flags |= FLAG_REVERSE; 00413 } 00414 fwrite_int4(flags, file); 00415 fwrite_int4(modifiers->getTrimStart(), file); 00416 fwrite_int4(modifiers->getTrimEnd(), file); 00417 } 00418 00419 SequenceModifiers* AlignmentBinaryFile::fread_flags(FILE* file) { 00420 SequenceModifiers* modifiers = new SequenceModifiers(); 00421 00422 int flags = fread_int4(file); 00423 modifiers->setClearN ((flags & FLAG_CLEAR_N) != 0); 00424 modifiers->setComplement((flags & FLAG_COMPLEMENT) != 0); 00425 modifiers->setReverse ((flags & FLAG_REVERSE) != 0); 00426 00427 modifiers->setTrimStart(fread_int4(file)); 00428 modifiers->setTrimEnd(fread_int4(file)); 00429 00430 return modifiers; 00431 } 00432 00433 void AlignmentBinaryFile::fwrite_gaps(vector<gap_t>* gaps, FILE* file) { 00434 int count = gaps->size(); 00435 fwrite_int4(count, file); 00436 //printf("Gaps[%d]\n", count); 00437 int last = 0; 00438 for (int i=0; i<count; i++) { 00439 int diff = (*gaps)[i].pos - last; 00440 last = (*gaps)[i].pos; 00441 //fwrite_int4((*gaps)[i].pos, file); 00442 //fwrite_int4((*gaps)[i].len, file); 00443 00444 fwrite_uint4_compressed(diff, file); 00445 fwrite_uint4_compressed((*gaps)[i].len, file); 00446 00447 //printf("%4d: %8d %d\n", i, (*gaps)[i].pos, (*gaps)[i].len); 00448 //printf("%4d: +%8d %d\n", i, diff, (*gaps)[i].len); 00449 } 00450 } 00451 00452 00453 00454 void AlignmentBinaryFile::fread_gaps(vector<gap_t>* gaps, FILE* file) { 00455 int count = fread_int4(file); 00456 gaps->clear(); 00457 //printf("Gaps[%d]\n", count); 00458 int last = 0; 00459 for (int i=0; i<count; i++) { 00460 gap_t gap; 00461 //gap.pos = fread_int4(file); 00462 //gap.len = fread_int4(file); 00463 00464 gap.pos = last + fread_uint4_compressed(file); 00465 last = gap.pos; 00466 gap.len = fread_uint4_compressed(file); 00467 gaps->push_back(gap); 00468 //printf("%4d: %8d %d\n", i, gap.pos, gap.len); 00469 } 00470 } 00471 00472 00473 void AlignmentBinaryFile::fwrite_str(const char* str, FILE* file) { 00474 int len = strlen(str); 00475 fwrite_int4(len, file); 00476 fwrite(str, sizeof(char), len, file); 00477 } 00478 00479 string AlignmentBinaryFile::fread_str(FILE* file) { 00480 int len = fread_int4(file); 00481 const int max_len = 1000; 00482 if (len > max_len) { 00483 fprintf(stderr, "Sanity Check: string too large during file read (%d > %d).\n", len, max_len); 00484 exit(0); 00485 } 00486 char cstr[max_len+5]; 00487 fread(cstr, sizeof(char), len, file); 00488 cstr[len] = '\0'; 00489 return string(cstr); 00490 } 00491 00492 00493 00494 00495 00496 00497 00498 00499 00500 void AlignmentBinaryFile::fwrite_array(const char* array, const int len, FILE* file) { 00501 fwrite(array, sizeof(char), len, file); 00502 } 00503 00504 void AlignmentBinaryFile::fread_array(char* array, const int len, FILE* file) { 00505 fread(array, sizeof(char), len, file); 00506 } 00507 00508 00509 void AlignmentBinaryFile::fwrite_uint4_compressed(const unsigned int i, FILE* file) { 00510 unsigned int b0 = (i & 0xF0000000)>>28; 00511 unsigned int b1 = (i & 0x0FE00000)>>21; 00512 unsigned int b2 = (i & 0x001FC000)>>14; 00513 unsigned int b3 = (i & 0x00003F80)>>7; 00514 unsigned int b4 = (i & 0x0000007F); 00515 00516 if (b0 > 0) { 00517 fwrite_int1(0x80|b0, file); 00518 fwrite_int1(0x80|b1, file); 00519 fwrite_int1(0x80|b2, file); 00520 fwrite_int1(0x80|b3, file); 00521 fwrite_int1(b4, file); 00522 } else if (b1 > 0) { 00523 fwrite_int1(0x80|b1, file); 00524 fwrite_int1(0x80|b2, file); 00525 fwrite_int1(0x80|b3, file); 00526 fwrite_int1(b4, file); 00527 } else if (b2 > 0) { 00528 fwrite_int1(0x80|b2, file); 00529 fwrite_int1(0x80|b3, file); 00530 fwrite_int1(b4, file); 00531 } else if (b3 > 0) { 00532 fwrite_int1(0x80|b3, file); 00533 fwrite_int1(b4, file); 00534 } else { 00535 fwrite_int1(b4, file); 00536 } 00537 00538 } 00539 00540 unsigned int AlignmentBinaryFile::fread_uint4_compressed(FILE* file) { 00541 unsigned char b = fread_int1(file); 00542 unsigned int i = (b & 0x7F); 00543 while (b >= 128) { 00544 b = fread_int1(file); 00545 i <<= 7; 00546 i |= (b & 0x7F); 00547 } 00548 return i; 00549 } 00550 00551 void AlignmentBinaryFile::fwrite_int4(const int i, FILE* file) { 00552 const int j = htonl(i); 00553 fwrite(&j, sizeof(int), 1, file); 00554 } 00555 00556 int AlignmentBinaryFile::fread_int4(FILE* file) { 00557 int i; 00558 fread(&i, sizeof(int), 1, file); 00559 return ntohl(i); 00560 } 00561 00562 void AlignmentBinaryFile::fwrite_int2(const short i, FILE* file) { 00563 const short j = htons(i); 00564 fwrite(&j, sizeof(short), 1, file); 00565 } 00566 00567 short AlignmentBinaryFile::fread_int2(FILE* file) { 00568 short i; 00569 fread(&i, sizeof(short), 1, file); 00570 return ntohs(i); 00571 } 00572 00573 void AlignmentBinaryFile::fwrite_int1(const unsigned char i, FILE* file) { 00574 fwrite(&i, sizeof(unsigned char), 1, file); 00575 } 00576 00577 unsigned char AlignmentBinaryFile::fread_int1(FILE* file) { 00578 unsigned char i; 00579 fread(&i, sizeof(unsigned char), 1, file); 00580 return i; 00581 } 00582 00583 void AlignmentBinaryFile::fread_dummy(const int len, FILE* file) { 00584 fseek(file, len, SEEK_CUR); 00585 } 00586 00587 00588
1.7.6.1