18#include "PedigreeDescription.h"
19#include "MapFunction.h"
20#include "MathVector.h"
22#include "FortranFormat.h"
30PedigreeDescription::PedigreeDescription()
36PedigreeDescription::~PedigreeDescription()
41 columnCount = rhs.columnCount;
43 columns = rhs.columns;
44 columnHash = rhs.columnHash;
49void PedigreeDescription::Load(
IFILE & input,
bool warnIfLinkage)
57 ReadLineHelper(input, buffer, tokens);
60 if (tokens.Length() == 4 && isdigit(tokens[0][0]))
62 if (warnIfLinkage) printf(
"Data file looks like a LINKAGE format file...\n\n");
63 LoadLinkageDataFile(input);
67 if (buffer.Length() > 18
68 && (buffer.SubStr(8,8).SlowCompare(
"AUTOSOME") == 0 ||
69 buffer.SubStr(8,8).SlowCompare(
"X-LINKED") == 0)
70 && (isdigit(buffer[16]) || isdigit(buffer[17]))
71 && (isdigit(buffer[18]) || isdigit(buffer[19]) ||
72 (buffer.Length() > 19 && isdigit(buffer[20]))))
74 printf(
"Data file looks like a MENDEL format file...\n"
75 " Activating EXPERIMENTAL support for this format\n\n");
76 LoadMendelDataFile(input);
89 while (!
ifeof(input) && !done)
93 buffer.ReadLine(input);
97 tokens.AddTokens(buffer, WHITESPACE);
99 if (tokens.Length() < 1)
continue;
101 if (tokens.Length() == 1)
102 error(
"Problem reading data file:\n"
103 "Item #%d (of type %s) has no name.",
104 columnCount+1, (
const char *) tokens[0]);
106 switch (toupper(tokens[0][0]))
109 columnHash.Push(GetAffectionID(tokens[1]));
110 columns.Push(pcAffection);
114 columnHash.Push(GetMarkerID(tokens[1]));
115 columns.Push(pcMarker);
119 columnHash.Push(GetTraitID(tokens[1]));
120 columns.Push(pcTrait);
124 columnHash.Push(GetCovariateID(tokens[1]));
125 columns.Push(pcCovariate);
129 columnHash.Push(GetStringID(tokens[1]));
130 columns.Push(pcString);
134 i = (int) tokens[0].SubStr(1);
138 columns.Push(pcSkip);
145 columns.Push(pcZygosity);
149 GetMarkerID(tokens[1]);
155 if (toupper(tokens[0][1]) ==
'T' && toupper(tokens[0][2]) ==
'C')
157 int c = GetCovariateID(tokens[1]);
158 int t = GetTraitID(tokens[1]);
160 if (c >= 32767 || t >= 32767)
161 error(
"Internal error processing data file\n");
163 columnHash.Push(t * 32768 + c);
164 columns.Push(pcUndocumentedTraitCovariate);
169 error(
"Problem in data file (line %d):\n%s\n",
170 line, (
const char *) buffer);
178void PedigreeDescription::Load(
const char * iFilename,
bool warnIfLinkage)
184 "The datafile %s cannot be opened\n\n"
185 "Common causes for this problem are:\n"
186 " * You might not have used the correct options to specify input file names,\n"
187 " please check the program documentation for information on how to do this\n\n"
188 " * The file doesn't exist or the filename might have been misspelt\n\n"
189 " * The file exists but it is being used by another program which you will need\n"
190 " to close before continuing\n\n"
191 " * The file is larger than 2GB and you haven't compiled this application with\n"
192 " large file support.\n\n",
195 Load(f, warnIfLinkage);
198 filename = iFilename;
201void PedigreeDescription::LoadMap(
const char * iFilename)
207 "The mapfile %s cannot be opened\n\n"
208 "Please check that the file exists and is not being used by another program\n"
209 "To find out how to set input filenames, check the documentation\n",
216void PedigreeDescription::LoadMap(
IFILE & input)
222 int lastposition = 0;
226 buffer.ReadLine(input);
227 tokens.AddTokens(buffer, WHITESPACE);
229 while (tokens.Length() == 0 && !
ifeof(input))
231 buffer.ReadLine(input);
232 tokens.AddTokens(buffer, WHITESPACE);
235 if (tokens.Length() != 3)
236 error(
"Error reading map file header, which has %d columns.\n"
237 "Three columns were expected, corresponding to\n"
238 "MARKER_ID, MARKER_NAME and BASE_PAIR_POSITION\n"
239 "The offending header is transcribed below:\n\n"
240 "%s", tokens.Length(), (
const char *) buffer);
242 printf(
"Map file column labels\n"
243 " -- COLUMN 1, Expecting MARKER_ID, Read %s\n"
244 " -- COLUMN 2, Expecting MARKER_NAME, Read %s\n"
245 " -- COLUMN 3, Expecting BASE_PAIR_POSITION, Read %s\n\n",
246 (
const char *)(tokens[0]), (
const char *)(tokens[1]),
247 (
const char *)(tokens[2]));
250 while (!
ifeof(input))
255 buffer.ReadLine(input);
259 tokens.AddTokens(buffer, WHITESPACE);
261 if (tokens.Length() < 1)
continue;
262 if (tokens.Length() != 3)
263 error(
"Each line in the map file should have 3 tokens, corresponding\n"
264 "to MARKER_ID, MARKER_NAME and BASE_PAIR_POSITION respectively\n"
265 "However, there are %d tokens in line %d, transcribed below:\n\n"
266 "%s", tokens.Length(), line, (
const char *) buffer);
268 serial = (int) tokens[0];
269 if (serial != columnCount + 1)
270 error(
"Reading Marker Index from Map File...\n"
271 "Markers should be indexed consecutively starting at 1\n"
272 "Marker %d does not fit this pattern\n", columnCount + 1);
274 position = (int) tokens[2];
275 if (position < lastposition)
276 error(
"Reading Marker Position from Map File...\n"
277 "Marker position should be in base-pairs\n"
278 "and markers should be in map order\n");
281 lastposition = position;
283 columnHash.Push(GetMarkerID(tokens[1]));
284 columns.Push(pcMarker);
287 GetMarkerInfo(tokens[1])->position = position * 1e-8;
294int PedigreeDescription::CountTextColumns()
298 for (
int i = 0; i < columnCount; i++, count++)
299 if (columns[i] == pcMarker)
305void PedigreeDescription::LoadLinkageDataFile(
const char * iFilename)
311 "The linkage format datafile %s cannot be opened\n\n"
312 "Please check that the file exists and is not being used by another program\n"
313 "To find out how to set input filenames, check the documentation\n",
316 LoadLinkageDataFile(f);
319 filename = iFilename;
322void PedigreeDescription::LoadLinkageDataFile(
IFILE & input)
331 ReadLineHelper(input, buffer, tokens);
333 if (tokens.Length() != 4 || tokens[2].AsInteger() != (
int) chromosomeX ||
334 tokens[0].AsInteger() < 0)
335 error(
"Cannot handle first line of data file\n\n"
336 "Expecting four (4) numeric values, which correspond to:\n"
337 " num-loci -- number of loci in the pedigree\n"
338 " this value must be positive\n"
339 " risk-locus -- locus for which risks should be calculated\n"
340 " this value will be ignored\n"
341 " sex-link -- are the loci sex linked [0 - No, 1 - Yes]\n"
343 " program -- which LINKAGE program do you want to use?\n"
344 " this value will also be ignored\n\n"
345 "The actual input read:\n%s\n",
346 chromosomeX ?
"expecting X-linked data, so this value must be ONE (1)"
347 :
"expecting autosomal data, so this must be ZERO (0)",
348 (const char *) buffer);
350 int numloci = tokens[0];
352 ReadLineHelper(input, buffer, tokens);
354 if (tokens.Length() != 4 ||
355 tokens[0].AsInteger() != 0 ||
356 tokens[3].AsInteger() != 0)
357 error(
"Cannot handle second line of data file\n\n"
358 "Expecting four (4) numeric values, which correspond to:\n"
359 " mutation-model -- must be zero, corresponding to no mutation\n"
360 " male-mutation-rate -- ignored\n"
361 " female-mutation-rate -- ignored\n"
362 " linkage-disequilibrium -- must be zero, may be used in the future to\n"
363 " read haplotype frequencies\n\n"
364 "The actual input read:\n%s\n", (
const char *) buffer);
369 ReadLineHelper(input, buffer, markerOrder);
371 if (markerOrder.Length() > numloci)
372 error(
"The third line of the data file lists marker order\n\n"
373 "Although %d loci are defined [in the first line],\n"
374 "this line includes %d values:\n%s\n",
375 numloci, markerOrder.Length(), (
const char *) buffer);
378 bool need_blank_line =
false;
380 while (!
ifeof(input) && numloci--)
382 if (ReadLineHelper(input, buffer, tokens) == 0)
383 error(
"Linkage data file ends unexpectedly");
385 if (tokens.Length() < 2)
386 error(
"Incomplete locus information in data file\n"
387 "Information for each locus should include 2 or more fiels\n"
388 "The expected fields are:\n"
389 " field_type -- indicator of locus type (trait, marker,...)\n"
390 " alleles -- number of alleles\n"
391 " name -- locus name, preceded by hash (#) sign\n\n"
392 "The actual input read:\n%s\n", (
const char *) buffer);
394 int locus_type = (int) tokens[0];
395 int alleles = (int) tokens[1];
397 String locus_name(
"LOCUS");
398 locus_name += ++unknown;
400 if (tokens.Length() > 2 && tokens[2][0] ==
'#')
402 if (tokens[2][1] != 0)
403 locus_name = tokens[2].SubStr(1);
404 else if (tokens.Length() > 3)
405 locus_name = tokens[3];
408 if ((locus_type == 4 && alleles == 0) ||
409 (locus_type == 4 && alleles == 1))
411 columnHash.Push(GetCovariateID(locus_name));
412 columns.Push(pcCovariate);
417 if (locus_type == 0 && alleles == 0)
419 columnHash.Push(GetTraitID(locus_name));
420 columns.Push(pcTrait);
425 if (ReadLineHelper(input, buffer, tokens) != alleles)
426 error(
"Expecting %d allele frequencies, but input has %d columns:\n"
427 "%s\n", alleles, tokens.Length(), (
const char *) buffer);
429 Vector frequencies(alleles + 1);
431 frequencies[0] = 0.0;
432 for (
int i = 1; i <= alleles; i++)
433 frequencies[i] = (
double) tokens[i - 1];
435 double sum = frequencies.Sum();
438 error(
"Allele frequencies at %s sum to %f, which doesn't make sense\n",
439 (
const char *) locus_name, sum);
441 if (fabs(sum - 1.0) > 1.2e-5)
443 printf(
"Allele frequencies at %s sum to %f, adjusted to 1.0\n",
444 (
const char *) locus_name, sum);
445 need_blank_line =
true;
449 frequencies *= 1.0 / sum;
456 columnHash.Push(GetAffectionID(locus_name));
457 columns.Push(pcAffection);
461 if (ReadLineHelper(input, buffer, tokens) == 0)
462 error(
"Linkage data file ends unexpectedly\n");
465 int classes = tokens[0];
469 columns.Push(pcSkip);
474 if (chromosomeX) classes *= 2;
477 if (ReadLineHelper(input, buffer, tokens) == 0)
478 error(
"Linkage data file ends unexpectedly\n");
486 columnHash.Push(GetMarkerID(locus_name));
487 columns.Push(pcMarker);
491 MarkerInfo * info = GetMarkerInfo(locus_name);
493 info->freq = frequencies;
496 info->alleleLabels.Clear();
497 for (
int i = 0; i < frequencies.Length(); i++)
498 info->alleleLabels.Push(label = i);
499 info->IndexAlleles();
502 locus.Push(GetMarkerID(locus_name));
508 if (ReadLineHelper(input, buffer, tokens) == 0)
509 error(
"Linkage data file ends unexpectedly\n");
513 for (
int vars = tokens[0], i = 0; i < vars; i++)
515 if (ReadLineHelper(input, buffer, tokens) == 0)
516 error(
"Linkage data file ends unexpectedly\n");
518 String trait_name(locus_name);
526 columnHash.Push(GetTraitID(trait_name));
527 columns.Push(pcTrait);
532 if (ReadLineHelper(input, buffer, tokens) == 0)
533 error(
"Linkage data file ends unexpectedly\n");
536 if (ReadLineHelper(input, buffer, tokens) == 0)
537 error(
"Linkage data file ends unexpectedly\n");
544 error(
"The data file includes binary factors\n"
545 "Regretably, loci of this type are not supported\n\n");
548 error(
"Unsupported locus type [%d] in data file", locus_type);
553 if (need_blank_line) printf(
"\n");
558 ReadLineHelper(input, buffer, tokens);
559 int sexDifference = tokens.Length() ? tokens[0].AsInteger() : -1;
560 if (tokens.Length() != 2 ||
561 (sexDifference != 0 && sexDifference != 2) ||
562 tokens[1].AsInteger() != 0)
563 error(
"Error retrieving recombination information\n\n"
564 "Expecting two (2) numeric values, which correspond to:\n"
565 " sex-difference -- must be zero (no difference) or two (sex specific recombination)\n"
566 " map-function -- must be zero, that is, no interference\n"
567 "The actual input read:\n%s\n", (
const char *) buffer);
570 bool distance_in_centimorgans =
false;
572 for (
int r = 0; r <= sexDifference; r += 2)
574 ReadLineHelper(input, buffer, tokens);
575 if (tokens.Length() != markerOrder.Length() - 1)
576 error(
"Error retrieving recombination information\n\n"
577 "Expecting %d recombination fractions (current map includes %d loci)\n"
578 "Instead the following line was input:\n%s\n",
579 markerOrder.Length() - 1, markerOrder.Length(), (
const char *) buffer);
581 distances[r >> 1].Dimension(tokens.Length());
582 for (
int i = 0; i < tokens.Length(); i++)
583 distances[r >> 1][i] = (
double) tokens[i];
585 if (distances[r >> 1].Min() < 0.0)
586 error(
"Linkage datafile specifies negative recombination fractions");
588 bool centimorgans = distances[r >> 1].Max() > 0.5;
590 if (centimorgans && !distance_in_centimorgans)
591 printf(
" Some recombination fractions in datafile are greater than 0.5,\n"
592 " so recombination fractions will be interpreted as cM distances\n\n");
594 distance_in_centimorgans |= centimorgans;
597 double position = 0.0, positionMale = 0.0;
599 for (
int i = 0, moving =
false; i < markerOrder.Length(); i++)
601 int m = markerOrder[i].AsInteger() - 1;
603 if (m < 0 || m >= locus.Length())
604 error(
"The marker order in the linkage datafile is invalid\n");
611 info->chromosome = chromosomeX ? 9999 : 0;
613 if (sexDifference == 2)
614 info->position = (position + positionMale) * 0.5,
615 info->positionFemale = position,
616 info->positionMale = positionMale;
618 info->position = info->positionMale = info->positionFemale = position;
623 if (i < markerOrder.Length() - 1 && moving)
624 position += distance_in_centimorgans ?
625 0.01 * distances[0][i] : RecombinationToDistance(distances[0][i]);
627 if (sexDifference == 2 && i < markerOrder.Length() - 1 && moving)
628 positionMale += distance_in_centimorgans ?
629 0.01 * distances[1][i] : RecombinationToDistance(distances[1][i]);
633int PedigreeDescription::ReadLineHelper(
IFILE & input,
640 buffer.ReadLine(input);
644 int pos = buffer.FastFind(
">>");
645 if (pos == -1) pos = buffer.FastFind(
"<<");
646 if (pos == -1) pos = buffer.Length() + 1;
647 if (buffer[0] ==
'#') pos = 0;
651 tokens.AddTokens(buffer.Left(pos - 1), WHITESPACE);
654 while (tokens.Length() == 0 && !
ifeof(input));
656 return tokens.Length();
659void PedigreeDescription::LoadMendelDataFile(
const char * iFilename)
665 "The MENDEL format datafile %s cannot be opened\n\n"
666 "Please check that the file exists and is not being used by another program\n"
667 "To find out how to set input filenames, check the documentation\n",
670 LoadMendelDataFile(f);
674void PedigreeDescription::LoadMendelDataFile(
IFILE & file)
701 parser.SetInputFile(file);
702 parser.SetFormat(
"(2A8,I2,I3)");
706 parser.GetNextField(locusName);
707 parser.GetNextField(locusType);
708 alleleCount = parser.GetNextInteger();
709 phenoCount = parser.GetNextInteger();
711 if (locusName.IsEmpty() && locusType.IsEmpty() && alleleCount == 0 &&
712 phenoCount == 0 &&
ifeof(file))
716 if (locusType.Compare(
"AUTOSOME") != 0 && locusType.Compare(
"X-LINKED"))
717 error(
"Unrecognized locus type '%s' in Mendel data file\n\n"
718 "Recognized locus types are \"AUTOSOME\" and \"X-LINKED\".",
719 (
const char *) locusType);
721 if (locusType.Compare(
"AUTOSOME") == 0 && chromosomeX)
722 error(
"The data file indicates that locus %s is AUTOSOMAL, but\n"
723 "X-LINKED loci were expected as input\n",
724 (
const char *) locusName);
726 if (locusType.Compare(
"X-LINKED") == 0 && !chromosomeX)
727 error(
"The data file indicates that locus %s is X-LINKED, but\n"
728 "AUTOSOMAL loci were expected as input\n",
729 (
const char *) locusName);
731 if (locusName.IsEmpty())
732 error(
"Blank locus name encountered in data file\n");
737 columns.Push(pcMarker);
738 columnHash.Push(GetMarkerID(locusName));
744 info->alleleLabels.Clear();
745 info->alleleLabels.Push(
"");
748 parser.SetFormat(
"(2A8)");
752 for (
int i = 0; i < alleleCount; i++)
754 parser.GetNextField(alleleLabel);
755 parser.GetNextField(alleleFreq);
757 if (alleleLabel.IsEmpty())
758 error(
"Locus %s is missing allele label for allele #%d\n",
759 (
const char *) locusName, i+1);
761 info->alleleLabels.Push(alleleLabel);
763 if (!alleleFreq.IsEmpty())
765 if (info->freq.Length() == 0)
766 info->freq.Push(0.0);
768 info->freq.Push(alleleFreq.AsDouble());
771 info->IndexAlleles();
773 if (info->alleleLabels.Length() != info->freq.Length() &&
774 info->freq.Length() != 0)
775 error(
"Locus %s is missing allele frequency information for %d alleles\n",
776 (
const char *) locusName,
777 info->alleleLabels.Length() - info->freq.Length());
782 parser.SetFormat(
"(2A8)");
785 for (
int i = 0; i < alleleCount; i++)
787 parser.GetNextField(alleleLabel);
788 parser.GetNextField(alleleFreq);
792 for (
int i = 0; i < alleleCount; i++)
794 parser.SetFormat(
"(A8,I3)");
795 parser.GetNextField(phenotype);
796 int genoCount = parser.GetNextInteger();
798 parser.SetFormat(
"(A17)");
799 for (
int j = 0; j < genoCount; j++)
800 parser.GetNextField(genotype);
802 columns.Push(pcAffection);
803 columnHash.Push(GetAffectionID(locusName +
"->" + phenotype));
813int PedigreeDescription::CountColumns(
int type)
817 for (
int i = 0; i < columns.Length(); i++)
818 if (columns[i] == type)
824const char * PedigreeDescription::ColumnSummary(
String &
string)
827 UpdateSummary(
string, pcMarker,
" markers [x2 cols]");
828 UpdateSummary(
string, pcTrait,
" traits");
829 UpdateSummary(
string, pcAffection,
" discrete traits");
830 UpdateSummary(
string, pcCovariate,
" covariates");
831 UpdateSummary(
string, pcString,
" strings");
832 UpdateSummary(
string, pcZygosity,
" zygosity");
833 UpdateSummary(
string, pcSkip,
" skipped");
837void PedigreeDescription::UpdateSummary(
String &
string,
int type,
const char * label)
839 int count = CountColumns(type);
851void PedigreeDescription::AddMarkerColumn(
const char * markerName)
853 if (columns.Last() == pcEnd)
859 columnHash.Push(GetMarkerID(markerName));
860 columns.Push(pcMarker);
864void PedigreeDescription::AddCovariateColumn(
const char * covariateName)
866 if (columns.Last() == pcEnd)
872 columnHash.Push(GetCovariateID(covariateName));
873 columns.Push(pcCovariate);
877void PedigreeDescription::AddTraitColumn(
const char * traitName)
879 if (columns.Last() == pcEnd)
885 columnHash.Push(GetCovariateID(traitName));
886 columns.Push(pcTrait);
890void PedigreeDescription::AddAffectionColumn(
const char * affectionName)
892 if (columns.Last() == pcEnd)
898 columnHash.Push(GetAffectionID(affectionName));
899 columns.Push(pcAffection);
903void PedigreeDescription::AddStringColumn(
const char * stringName)
905 if (columns.Last() == pcEnd)
911 columnHash.Push(GetStringID(stringName));
912 columns.Push(pcString);
916void PedigreeDescription::AddZygosityColumn()
918 if (columns.Last() == pcEnd)
925 columns.Push(pcZygosity);
929void PedigreeDescription::AddSkippedColumn()
931 if (columns.Last() == pcEnd)
938 columns.Push(pcSkip);