MASA-Core
AlignmentBinaryFile.cpp
Go to the documentation of this file.
00001 /*******************************************************************************
00002  *
00003  * Copyright (c) 2010-2015   Edans Sandes
00004  *
00005  * This file is part of MASA-Core.
00006  * 
00007  * MASA-Core is free software: you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation, either version 3 of the License, or
00010  * (at your option) any later version.
00011  * 
00012  * MASA-Core is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  * 
00017  * You should have received a copy of the GNU General Public License
00018  * along with MASA-Core.  If not, see <http://www.gnu.org/licenses/>.
00019  *
00020  ******************************************************************************/
00021 
00022 #include "AlignmentBinaryFile.hpp"
00023 
00024 #include <stdio.h>
00025 #include <vector>
00026 using namespace std;
00027 #include <arpa/inet.h>
00028 #include <stdlib.h>
00029 #include <string.h>
00030 
00031 #include "Constants.hpp"
00032 
00033 #include "../../common/BlocksFile.hpp"
00034 
00035 #define MAGIC_HEADER            "CGFF"
00036 #define MAGIC_HEADER_LEN        4 //in bytes
00037 #define FILE_VERSION_MAJOR      0
00038 #define FILE_VERSION_MINOR      1
00039 
00040 
00041 #define END_OF_FIELDS                                   (0)
00042 
00043 #define FIELD_ALIGNMENT_METHOD                  (1)
00044 #define FIELD_SCORING_SYSTEM                    (2)
00045 #define FIELD_PENALTY_SYSTEM                    (3)
00046 #define FIELD_SEQUENCE_PARAMS                   (4)
00047 
00048 
00049 #define FIELD_SEQUENCE_DESCRIPTION              (1)
00050 #define FIELD_SEQUENCE_TYPE                             (2)
00051 #define FIELD_SEQUENCE_SIZE                             (3)
00052 #define FIELD_SEQUENCE_HASH                             (4)
00053 #define FIELD_SEQUENCE_DATA_PLAIN               (5)
00054 #define FIELD_SEQUENCE_DATA_COMPRESSED  (6)
00055 
00056 #define FIELD_RESULT_RAW_SCORE                  (1)
00057 #define FIELD_RESULT_BIT_SCORE                  (2)
00058 #define FIELD_RESULT_E_VALUE                    (3)
00059 #define FIELD_RESULT_SCORE_STATISTICS   (4)
00060 #define FIELD_RESULT_GAP_LIST                   (5)
00061 #define FIELD_RESULT_BLOCKS                             (6)
00062 #define FIELD_RESULT_CELLS                              (7)
00063 
00064 
00065 #define MAX_SEQUENCES   (2)
00066 
00067 
00068 void AlignmentBinaryFile::write(string filename, Alignment* alignment) {
00069     FILE* file = fopen(filename.c_str(), "wb");
00070 
00071     fwrite_header(file);
00072 
00073     AlignmentParams* params = alignment->getAlignmentParams();
00074     vector<Sequence*> sequences = params->getSequences();
00075     vector<SequenceInfo*> info;
00076     info.clear();
00077     for (int i=0; i<sequences.size(); i++) {
00078         sequences[i]->getInfo()->setIndex(i);
00079         info.push_back(sequences[i]->getInfo());
00080     }
00081 
00082 
00083     fwrite_sequences(info, file);
00084     fwrite_alignment_params(params, file);
00085     fwrite_alignment_result(alignment, file);
00086 
00087     fclose(file);
00088 }
00089 
00090 Alignment* AlignmentBinaryFile::read(string filename) {
00091         FILE* file = fopen(filename.c_str(), "rb");
00092 
00093         fread_header(file);
00094 
00095         vector<SequenceInfo*> sequences = fread_sequences(file);
00096         AlignmentParams* params = fread_alignment_params(sequences, file);
00097         Alignment* alignment = fread_alignment_result(params, file);
00098 
00099         fclose(file);
00100         return alignment;
00101 }
00102 
00103 void AlignmentBinaryFile::fwrite_header(FILE* file) {
00104         fwrite_array(MAGIC_HEADER, MAGIC_HEADER_LEN, file);
00105         fwrite_int1(FILE_VERSION_MAJOR, file);
00106         fwrite_int1(FILE_VERSION_MINOR, file);
00107 }
00108 
00109 void AlignmentBinaryFile::fread_header(FILE* file) {
00110         char header[MAGIC_HEADER_LEN];
00111         fread_array(header, MAGIC_HEADER_LEN, file);
00112         if (memcmp(header, MAGIC_HEADER, MAGIC_HEADER_LEN) != 0) {
00113                 fprintf(stderr, "Wrong File Format. Header Error.\n");
00114                 // TODO ERROR
00115                 return;
00116         }
00117         int file_version_major = fread_int1(file);
00118         int file_version_minor = fread_int1(file);
00119         if (file_version_major > FILE_VERSION_MAJOR) {
00120                 fprintf(stderr, "File Version not supported (%d.%d > %d.%d).\n",
00121                                 file_version_major, file_version_minor,
00122                                 FILE_VERSION_MAJOR, FILE_VERSION_MINOR);
00123                 // TODO ERROR
00124                 return;
00125         }
00126 }
00127 
00128 void AlignmentBinaryFile::fwrite_sequences(vector<SequenceInfo*> sequences, FILE* file) {
00129         int count = sequences.size();
00130         fwrite_int4(count, file);
00131         for (int i=0; i<count; i++) {
00132                 SequenceInfo* seq = sequences[i];
00133                 seq->setIndex(i);
00134 
00135                 fwrite_int1(FIELD_SEQUENCE_DESCRIPTION, file);
00136                 fwrite_str(seq->getDescription().c_str(), file);
00137 
00138                 fwrite_int1(FIELD_SEQUENCE_TYPE, file);
00139                 fwrite_int1(seq->getType(), file);
00140 
00141                 fwrite_int1(FIELD_SEQUENCE_SIZE, file);
00142                 fwrite_int4(seq->getSize(), file);
00143 
00144                 fwrite_int1(END_OF_FIELDS, file);
00145         }
00146 }
00147 
00148 vector<SequenceInfo*> AlignmentBinaryFile::fread_sequences(FILE* file) {
00149         vector<SequenceInfo*> sequences;
00150         int count = fread_int4(file);
00151         if (count > MAX_SEQUENCES) {
00152                 fprintf(stderr, "Sanity Check: too many sequences (%d > %d).\n", count, MAX_SEQUENCES);
00153                 exit(0);
00154         }
00155         for (int i=0; i<count; i++) {
00156                 SequenceInfo* info = new SequenceInfo();
00157                 sequences.push_back(info);
00158                 int field;
00159                 while ((field = fread_int1(file)) != END_OF_FIELDS) {
00160 
00161                         int field_len;
00162                         string str = "";
00163                         switch (field) {
00164                                 case FIELD_SEQUENCE_DESCRIPTION:
00165                                         info->setDescription(fread_str(file));
00166                                         break;
00167                                 case FIELD_SEQUENCE_TYPE:
00168                                         info->setType(fread_int1(file));
00169                                         break;
00170                                 case FIELD_SEQUENCE_SIZE:
00171                                         info->setSize(fread_int4(file));
00172                                         break;
00173                                 case FIELD_SEQUENCE_HASH:
00174                                         info->setHash(fread_str(file));
00175                                         break;
00176                                 case FIELD_SEQUENCE_DATA_PLAIN:
00177                                         field_len = fread_int4(file);
00178                                         fread_dummy(field_len, file);
00179                                         break;
00180                                 case FIELD_SEQUENCE_DATA_COMPRESSED:
00181                                         field_len = fread_int4(file);
00182                                         fread_dummy(field_len, file);
00183                                         break;
00184                                 default:
00185                                         fprintf(stderr, "Sanity Check: Unknown Field (%d).\n", field);
00186                                         exit(0);
00187                                         break;
00188 
00189                         }
00190                 }
00191         }
00192         return sequences;
00193 }
00194 
00195 
00196 void AlignmentBinaryFile::fwrite_alignment_params(AlignmentParams* params, FILE* file) {
00197         fwrite_int1(FIELD_ALIGNMENT_METHOD, file);
00198         fwrite_int1(params->getAlignmentMethod(), file);
00199 
00200         fwrite_int1(FIELD_SCORING_SYSTEM, file);
00201         fwrite_int1(params->getScoreSystem(), file);
00202         switch (params->getScoreSystem()) {
00203                 case SCORE_MATCH_MISMATCH:
00204                         fwrite_int4(params->getMatch(), file);
00205                         fwrite_int4(params->getMismatch(), file);
00206                         break;
00207                 default:
00208                         fprintf(stderr, "Sanity Check: Unknown Score System (%d).\n", params->getPenaltySystem());
00209                         exit(0);
00210                         break;
00211         }
00212 
00213         fwrite_int1(FIELD_PENALTY_SYSTEM, file);
00214         fwrite_int1(params->getPenaltySystem(), file);
00215         switch (params->getPenaltySystem()) {
00216                 case PENALTY_LINEAR_GAP:
00217                         fwrite_int4(params->getGapExtension(), file);
00218                         break;
00219                 case PENALTY_AFFINE_GAP:
00220                         fwrite_int4(params->getGapOpen(), file);
00221                         fwrite_int4(params->getGapExtension(), file);
00222                         break;
00223                 default:
00224                         fprintf(stderr, "Sanity Check: Unknown Penalty System (%d).\n", params->getPenaltySystem());
00225                         exit(0);
00226                         break;
00227         }
00228 
00229         fwrite_int1(FIELD_SEQUENCE_PARAMS, file);
00230         int count = params->getSequencesCount();
00231         fwrite_int4(count, file);
00232         for (int i=0; i<count; i++) {
00233                 SequenceInfo* sequenceInfo = params->getSequence(i)->getInfo();
00234                 fwrite_int4(sequenceInfo->getIndex(), file);
00235                 SequenceModifiers* modifiers = params->getSequence(i)->getModifiers();
00236                 fwrite_flags(modifiers, file);
00237         }
00238 
00239         fwrite_int1(END_OF_FIELDS, file);
00240 }
00241 
00242 AlignmentParams* AlignmentBinaryFile::fread_alignment_params(vector<SequenceInfo*> sequences, FILE* file) {
00243         AlignmentParams* params = new AlignmentParams();
00244         int field;
00245         while ((field = fread_int1(file)) != END_OF_FIELDS) {
00246                 switch (field) {
00247                         case FIELD_ALIGNMENT_METHOD:
00248                                 params->setAlignmentMethod(fread_int1(file));
00249                                 break;
00250                         case FIELD_SCORING_SYSTEM:
00251                                 params->setScoreSystem(fread_int1(file));
00252                                 switch (params->getScoreSystem()) {
00253                                         case SCORE_MATCH_MISMATCH: {
00254                                                 int match = fread_int4(file);
00255                                                 int mismatch = fread_int4(file);
00256                                                 params->setMatchMismatchScores(match, mismatch);
00257                                                 break;
00258                                         }
00259                                         case SCORE_SIMILARITY_MATRIX: {
00260                                                 fprintf(stderr, "Score Matrix not supported yet.\n");
00261                                                 exit(0);
00262                                                 break;
00263                                         }
00264                                         default:
00265                                                 fprintf(stderr, "Unknown Score System.\n");
00266                                                 exit(1);
00267                                                 break;
00268                                 }
00269                                 break;
00270                         case FIELD_PENALTY_SYSTEM:
00271                                 params->setPenaltySystem(fread_int1(file));
00272                                 switch (params->getPenaltySystem()) {
00273                                         case PENALTY_LINEAR_GAP: {
00274                                                 int gapOpen = 0;
00275                                                 int gapExtension = fread_int4(file);
00276                                                 params->setAffineGapPenalties(gapOpen, gapExtension);
00277                                                 break;
00278                                         }
00279                                         case PENALTY_AFFINE_GAP: {
00280                                                 int gapOpen = fread_int4(file);
00281                                                 int gapExtension = fread_int4(file);
00282                                                 params->setAffineGapPenalties(gapOpen, gapExtension);
00283                                                 break;
00284                                         }
00285                                         default:
00286                                                 fprintf(stderr, "Unknown Penalty System.\n");
00287                                                 exit(1);
00288                                                 break;
00289                                 }
00290                                 break;
00291                         case FIELD_SEQUENCE_PARAMS: {
00292                                 int count = fread_int4(file);
00293                                 for (int i=0; i<count; i++) {
00294                                         int index = fread_int4(file);
00295 
00296                                         SequenceModifiers* modifiers = fread_flags(file);
00297 
00298                                         params->addSequence(new Sequence(sequences[index], modifiers));
00299                                 }
00300                                 break;
00301                         }
00302                         default:
00303                                 fprintf(stderr, "Sanity Check: Unknown Field (%d).\n", field);
00304                                 exit(0);
00305                                 break;
00306 
00307                 }
00308         }
00309         return params;
00310 }
00311 
00312 
00313 void AlignmentBinaryFile::fwrite_alignment_result(Alignment* alignment, FILE* file) {
00314         fwrite_int4(1, file); // Only 1 result is supported by now
00315         int count = alignment->getAlignmentParams()->getSequencesCount();
00316 
00317         fwrite_int1(FIELD_RESULT_RAW_SCORE, file);
00318         fwrite_int4(alignment->getRawScore(), file);
00319 
00320         fwrite_int1(FIELD_RESULT_SCORE_STATISTICS, file);
00321         fwrite_int4(alignment->getMatches(), file);
00322         fwrite_int4(alignment->getMismatches(), file);
00323         fwrite_int4(alignment->getGapOpen(), file);
00324         fwrite_int4(alignment->getGapExtensions(), file);
00325 
00326         fwrite_int1(FIELD_RESULT_GAP_LIST, file);
00327         for (int i=0; i<count; i++) {
00328                 fwrite_int4(alignment->getStart(i), file);
00329                 fwrite_int4(alignment->getEnd(i), file);
00330                 fwrite_gaps(alignment->getGaps(i), file);
00331         }
00332 
00333         if (!alignment->getPruningFile().empty()) {
00334                 //FILE* pruningFile = fopen(alignment->getPruningFile().c_str(), "rb");
00335                 fwrite_int1(FIELD_RESULT_BLOCKS, file);
00336                 BlocksFile* blocksFile = new BlocksFile(alignment->getPruningFile());
00337                 int bw = 512;
00338                 int bh = 512;
00339                 int* buffer = blocksFile->reduceData(bh, bw);
00340                 fwrite_int4(bh, file);
00341                 fwrite_int4(bw, file);
00342                 for (int i=0; i<bh*bw; i++) {
00343                         fwrite_int4(buffer[i], file);
00344                 }
00345                 delete blocksFile;
00346 //              int i;
00347 //              while (fread(&i, sizeof(int), 1, pruningFile) > 0) {
00348 //                      fwrite_int4(i, file);
00349 //              }
00350 //              fclose(pruningFile);
00351         }
00352 
00353         fwrite_int1(END_OF_FIELDS, file);
00354 }
00355 
00356 Alignment* AlignmentBinaryFile::fread_alignment_result(AlignmentParams* params, FILE* file) {
00357         int results = fread_int4(file);
00358         if (results > 1) {
00359                 // Only 1 result is supported by now
00360                 fprintf(stderr, "Too many results %d.\n", results);
00361                 exit(1);
00362         }
00363 
00364         Alignment* alignment = new Alignment(params);
00365         int count = params->getSequencesCount();
00366         int field;
00367         while ((field = fread_int1(file)) != END_OF_FIELDS) {
00368                 switch (field) {
00369                         case FIELD_RESULT_RAW_SCORE:
00370                                 alignment->setRawScore(fread_int4(file));
00371                                 break;
00372                         case FIELD_RESULT_SCORE_STATISTICS:
00373                                 alignment->setMatches(fread_int4(file));
00374                                 alignment->setMismatches(fread_int4(file));
00375                                 alignment->setGapOpen(fread_int4(file));
00376                                 alignment->setGapExtensions(fread_int4(file));
00377                                 break;
00378                         case FIELD_RESULT_GAP_LIST:{
00379                                 for (int i=0; i<count; i++) {
00380                                         alignment->setStart(i, fread_int4(file));
00381                                         alignment->setEnd(i, fread_int4(file));
00382                                         fread_gaps(alignment->getGaps(i), file);
00383                                 }
00384                                 break;
00385                         }
00386                         case FIELD_RESULT_BLOCKS:{
00387                                 int h = fread_int4(file);
00388                                 int w = fread_int4(file);
00389                                 // NOT IMPLEMENTED FOR READING
00390                                 fread_dummy(h*w*sizeof(int), file);
00391                                 break;
00392                         }
00393                         default:
00394                                 fprintf(stderr, "Sanity Check: Unknown Field (%d).\n", field);
00395                                 exit(0);
00396                                 break;
00397 
00398                 }
00399         }
00400         return alignment;
00401 }
00402 
00403 void AlignmentBinaryFile::fwrite_flags(SequenceModifiers* modifiers, FILE* file) {
00404         int flags = 0;
00405         if (modifiers->isClearN()) {
00406                 flags |= FLAG_CLEAR_N;
00407         }
00408         if (modifiers->isComplement()) {
00409                 flags |= FLAG_COMPLEMENT;
00410         }
00411         if (modifiers->isReverse()) {
00412                 flags |= FLAG_REVERSE;
00413         }
00414         fwrite_int4(flags, file);
00415         fwrite_int4(modifiers->getTrimStart(), file);
00416         fwrite_int4(modifiers->getTrimEnd(), file);
00417 }
00418 
00419 SequenceModifiers* AlignmentBinaryFile::fread_flags(FILE* file) {
00420         SequenceModifiers* modifiers = new SequenceModifiers();
00421 
00422         int flags = fread_int4(file);
00423         modifiers->setClearN    ((flags & FLAG_CLEAR_N) != 0);
00424         modifiers->setComplement((flags & FLAG_COMPLEMENT) != 0);
00425         modifiers->setReverse   ((flags & FLAG_REVERSE) != 0);
00426 
00427         modifiers->setTrimStart(fread_int4(file));
00428         modifiers->setTrimEnd(fread_int4(file));
00429 
00430         return modifiers;
00431 }
00432 
00433 void AlignmentBinaryFile::fwrite_gaps(vector<gap_t>* gaps, FILE* file) {
00434     int count = gaps->size();
00435         fwrite_int4(count, file);
00436         //printf("Gaps[%d]\n", count);
00437         int last = 0;
00438         for (int i=0; i<count; i++) {
00439                 int diff = (*gaps)[i].pos - last;
00440                 last = (*gaps)[i].pos;
00441                 //fwrite_int4((*gaps)[i].pos, file);
00442                 //fwrite_int4((*gaps)[i].len, file);
00443 
00444                 fwrite_uint4_compressed(diff, file);
00445                 fwrite_uint4_compressed((*gaps)[i].len, file);
00446 
00447                 //printf("%4d: %8d %d\n", i, (*gaps)[i].pos, (*gaps)[i].len);
00448                 //printf("%4d: +%8d %d\n", i, diff, (*gaps)[i].len);
00449         }
00450 }
00451 
00452 
00453 
00454 void AlignmentBinaryFile::fread_gaps(vector<gap_t>* gaps, FILE* file) {
00455         int count = fread_int4(file);
00456         gaps->clear();
00457         //printf("Gaps[%d]\n", count);
00458         int last = 0;
00459         for (int i=0; i<count; i++) {
00460                 gap_t gap;
00461                 //gap.pos = fread_int4(file);
00462                 //gap.len = fread_int4(file);
00463 
00464                 gap.pos = last + fread_uint4_compressed(file);
00465                 last = gap.pos;
00466                 gap.len = fread_uint4_compressed(file);
00467                 gaps->push_back(gap);
00468                 //printf("%4d: %8d %d\n", i, gap.pos, gap.len);
00469         }
00470 }
00471 
00472 
00473 void AlignmentBinaryFile::fwrite_str(const char* str, FILE* file) {
00474     int len = strlen(str);
00475     fwrite_int4(len, file);
00476     fwrite(str, sizeof(char), len, file);
00477 }
00478 
00479 string AlignmentBinaryFile::fread_str(FILE* file) {
00480     int len = fread_int4(file);
00481         const int max_len = 1000;
00482         if (len > max_len) {
00483                 fprintf(stderr, "Sanity Check: string too large during file read (%d > %d).\n", len, max_len);
00484                 exit(0);
00485         }
00486         char cstr[max_len+5];
00487     fread(cstr, sizeof(char), len, file);
00488     cstr[len] = '\0';
00489         return string(cstr);
00490 }
00491 
00492 
00493 
00494 
00495 
00496 
00497 
00498 
00499 
00500 void AlignmentBinaryFile::fwrite_array(const char* array, const int len, FILE* file) {
00501     fwrite(array, sizeof(char), len, file);
00502 }
00503 
00504 void AlignmentBinaryFile::fread_array(char* array, const int len, FILE* file) {
00505     fread(array, sizeof(char), len, file);
00506 }
00507 
00508 
00509 void AlignmentBinaryFile::fwrite_uint4_compressed(const unsigned int i, FILE* file) {
00510         unsigned int b0 = (i & 0xF0000000)>>28;
00511         unsigned int b1 = (i & 0x0FE00000)>>21;
00512         unsigned int b2 = (i & 0x001FC000)>>14;
00513         unsigned int b3 = (i & 0x00003F80)>>7;
00514         unsigned int b4 = (i & 0x0000007F);
00515 
00516         if (b0 > 0) {
00517                 fwrite_int1(0x80|b0, file);
00518                 fwrite_int1(0x80|b1, file);
00519                 fwrite_int1(0x80|b2, file);
00520                 fwrite_int1(0x80|b3, file);
00521                 fwrite_int1(b4, file);
00522         } else if (b1 > 0) {
00523                 fwrite_int1(0x80|b1, file);
00524                 fwrite_int1(0x80|b2, file);
00525                 fwrite_int1(0x80|b3, file);
00526                 fwrite_int1(b4, file);
00527         } else if (b2 > 0) {
00528                 fwrite_int1(0x80|b2, file);
00529                 fwrite_int1(0x80|b3, file);
00530                 fwrite_int1(b4, file);
00531         } else if (b3 > 0) {
00532                 fwrite_int1(0x80|b3, file);
00533                 fwrite_int1(b4, file);
00534         } else {
00535                 fwrite_int1(b4, file);
00536         }
00537 
00538 }
00539 
00540 unsigned int AlignmentBinaryFile::fread_uint4_compressed(FILE* file) {
00541         unsigned char b = fread_int1(file);
00542         unsigned int i = (b & 0x7F);
00543         while (b >= 128) {
00544                 b = fread_int1(file);
00545                 i <<= 7;
00546                 i |= (b & 0x7F);
00547         }
00548         return i;
00549 }
00550 
00551 void AlignmentBinaryFile::fwrite_int4(const int i, FILE* file) {
00552         const int j = htonl(i);
00553     fwrite(&j, sizeof(int), 1, file);
00554 }
00555 
00556 int AlignmentBinaryFile::fread_int4(FILE* file) {
00557         int i;
00558         fread(&i, sizeof(int), 1, file);
00559         return ntohl(i);
00560 }
00561 
00562 void AlignmentBinaryFile::fwrite_int2(const short i, FILE* file) {
00563         const short j = htons(i);
00564     fwrite(&j, sizeof(short), 1, file);
00565 }
00566 
00567 short AlignmentBinaryFile::fread_int2(FILE* file) {
00568         short i;
00569         fread(&i, sizeof(short), 1, file);
00570         return ntohs(i);
00571 }
00572 
00573 void AlignmentBinaryFile::fwrite_int1(const unsigned char i, FILE* file) {
00574     fwrite(&i, sizeof(unsigned char), 1, file);
00575 }
00576 
00577 unsigned char AlignmentBinaryFile::fread_int1(FILE* file) {
00578         unsigned char i;
00579         fread(&i, sizeof(unsigned char), 1, file);
00580         return i;
00581 }
00582 
00583 void AlignmentBinaryFile::fread_dummy(const int len, FILE* file) {
00584         fseek(file, len, SEEK_CUR);
00585 }
00586 
00587 
00588