Skip to content

Commit

Permalink
Merge pull request cBioPortal#9102 from thehyve/unify_ref_genome_handling
Browse files Browse the repository at this point in the history

fix bug in importer and revisit ref genome in validator
  • Loading branch information
inodb authored Jan 3, 2022
2 parents 0bf166d + 607cf8e commit d7b33c5
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 48 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -145,13 +145,11 @@ public void importData() throws IOException, DaoException {
GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId);

CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByInternalId(geneticProfile.getCancerStudyId());
String genomeBuildName;
String referenceGenome = cancerStudy.getReferenceGenome();
if (referenceGenome == null) {
genomeBuildName = GlobalProperties.getReferenceGenomeName();
} else {
genomeBuildName = DaoReferenceGenome.getReferenceGenomeByGenomeName(referenceGenome).getBuildName();
referenceGenome = GlobalProperties.getReferenceGenomeName();
}
String genomeBuildName = DaoReferenceGenome.getReferenceGenomeByGenomeName(referenceGenome).getBuildName();

while((line=buf.readLine()) != null)
{
Expand Down
73 changes: 40 additions & 33 deletions core/src/main/scripts/importer/validateData.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,8 +308,8 @@ def __init__(self, portal_info_dict, cancer_type_dict, hugo_entrez_map, alias_en

#Set defaults for genome version and species
self.__species = 'human'
self.__ncbi_build = '37'
self.__genome_build = 'hg19'
self.__ncbi_build = 'GRCh37'
self.__genome_name = 'hg19'

# determine version, and the reason why it might be unknown
if portal_info_dict is None:
Expand All @@ -328,23 +328,26 @@ def species(self):

@species.setter
def species(self, species):
self.__species= species
self.__species = species

@property
def genome_build(self):
return self.__genome_build
def reference_genome(self):
return self.__genome_name

@genome_build.setter
def genome_build(self, genome_build):
self.__genome_build= genome_build
@reference_genome.setter
def reference_genome(self, genome_name):
self.__genome_name = genome_name

@property
def ncbi_build(self):
return self.__ncbi_build

@ncbi_build.setter
def ncbi_build(self, ncbi_build):
self.__ncbi_build = ncbi_build
def ncbi_build(self, ncbi_build):
prefix = 'GRCm' if self.__species == 'mouse' else 'GRCh'
if str(ncbi_build) in ('37', '38'):
ncbi_build = prefix + str(ncbi_build)
self.__ncbi_build = ncbi_build

class Validator(object):

Expand Down Expand Up @@ -931,7 +934,7 @@ def _checkRepeatedColumns(self):
return num_errors

@staticmethod
def load_chromosome_lengths(genome_build, logger):
def load_chromosome_lengths(reference_genome, logger):

"""Get the length of each chromosome and return a dict.
Expand All @@ -951,9 +954,10 @@ def load_chromosome_lengths(genome_build, logger):
chrom_size_file)

try:
chrom_size_dict = chrom_sizes[genome_build]
chrom_size_dict = chrom_sizes[reference_genome]
except KeyError:
raise KeyError('Could not load chromosome sizes for genome build %s.' % genome_build)
raise KeyError('Could not load chromosome sizes for genome build %s. Expecting one of '
'["hg19", "hg38", "mm10"]' % reference_genome)

return chrom_size_dict

Expand Down Expand Up @@ -1985,20 +1989,20 @@ def checkOffPanelVariant(self, data, normalized_gene, panel_id, hugo_symbol, ent


def checkNCBIbuild(self, value):
"""
Checks whether the value found in MAF NCBI_Build column matches the genome specified in portal.properties at
field ncbi.build. Expecting GRCh37, GRCh38, GRCm38 or without the GRCx prefix
"""

if value != '':
# based on MutationDataUtils.getNcbiBuild
if self.portal.species == "human":
if value not in [str(self.portal.ncbi_build), self.portal.genome_build, 'GRCh'+str(self.portal.ncbi_build)]:
self.logger.error('The specified reference genome does not correspond with the reference genome found in the MAF.',
extra={'line_number': self.line_number,
'cause':value})
return False
elif self.portal.species == "mouse":
if value not in [str(self.portal.ncbi_build), self.portal.genome_build, 'GRCm'+str(self.portal.ncbi_build)]:
self.logger.error('The specified reference genome does not correspond with the reference genome found in the MAF.',
extra={'line_number': self.line_number,
'cause':value})
return False
prefix = 'GRCm' if self.portal.species == 'mouse' else 'GRCh'
if str(value) in ('37', '38'):
value = prefix + str(value)
if value != str(self.portal.ncbi_build):
self.extra = 'The reference genome in column NCBI_Build does not correspond with the ' \
'reference genome specified for this study (%s)' % (self.portal.ncbi_build)
self.extra_exists = True
return False
return True

def checkMatchedNormSampleBarcode(self, value):
Expand Down Expand Up @@ -4662,9 +4666,9 @@ def process_metadata_files(directory, portal_instance, logger, relaxed_mode, str

# implemented ref genomes
reference_genome_map = {
'hg19':('human','37','hg19'),
'hg38':('human','38','hg38'),
'mm10':('mouse','38','mm10')
'hg19': ('human', 'GRCh37', 'hg19'),
'hg38': ('human', 'GRCh38', 'hg38'),
'mm10': ('mouse', 'GRCm38', 'mm10')
}

DISALLOWED_CHARACTERS = r'[^A-Za-z0-9_-]'
Expand Down Expand Up @@ -4732,17 +4736,20 @@ def process_metadata_files(directory, portal_instance, logger, relaxed_mode, str
meta_dictionary['add_global_case_list'].lower() == 'true'):
case_list_suffix_fns['all'] = filename

# if reference_genome is specified in the meta file, override the defaults in portal properties
if 'reference_genome' in meta_dictionary:
if meta_dictionary['reference_genome'] not in reference_genome_map:
logger.error('Unknown reference genome defined. Should be one of %s' %
list(reference_genome_map.keys()),
extra={
'filename_':filename,
'cause':meta_dictionary['reference_genome'].strip()
'filename_': filename,
'cause': meta_dictionary['reference_genome'].strip()
})
else:
portal_instance.species, portal_instance.ncbi_build, portal_instance.genome_build = \
reference_genome_map[meta_dictionary['reference_genome']]
genome_info = reference_genome_map[meta_dictionary['reference_genome']]
logger.info('Setting reference genome to %s (%s, %s)' % genome_info,
extra={'filename_': filename})
portal_instance.species, portal_instance.ncbi_build, portal_instance.reference_genome = genome_info
else:
logger.info('No reference genome specified -- using default (hg19)',
extra={'filename_': filename})
Expand Down
10 changes: 9 additions & 1 deletion core/src/test/scripts/test_data/study_es_0/result_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -3097,7 +3097,7 @@ <h4 class="panel-title">meta_structural_variants.txt</h4>
<a role="button" data-toggle="collapse" aria-expanded="true"
href="#collapse_bWV0YV9zdHVkeS50eHQ",
aria-controls="collapse_bWV0YV9zdHVkeS50eHQ">
<span class="badge">2</span>
<span class="badge">3</span>
<h4 class="panel-title">meta_study.txt</h4>
</a>
</div>
Expand Down Expand Up @@ -3133,6 +3133,14 @@ <h4 class="panel-title">meta_study.txt</h4>
<td>&ndash;</td>
<td></td>
</tr>
<tr class="success">
<td><span class="glyphicon glyphicon-ok" aria-hidden="true"></span><span class="sr-only">Info</span></td>
<td>&ndash;</td>
<td>&ndash;</td>
<td>Setting reference genome to human (GRCh37, hg19)</td>
<td>&ndash;</td>
<td></td>
</tr>
</tbody>
</table>
</div>
Expand Down
12 changes: 2 additions & 10 deletions core/src/test/scripts/test_data/study_quotes/result_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ <h4 class="panel-title">General</h4>
<a role="button" data-toggle="collapse" aria-expanded="true"
href="#collapse_YnJjYV90Y2dhX3B1Yi5tYWY",
aria-controls="collapse_YnJjYV90Y2dhX3B1Yi5tYWY">
<span class="badge">9</span>
<span class="badge">8</span>
<h4 class="panel-title">brca_tcga_pub.maf</h4>
</a>
</div>
Expand Down Expand Up @@ -220,19 +220,11 @@ <h4 class="panel-title">brca_tcga_pub.maf</h4>
<td>Silent</td>
<td></td>
</tr>
<tr class="danger">
<td><span class="glyphicon glyphicon-remove" aria-hidden="true"></span><span class="sr-only">Error</span></td>
<td>8</td>
<td>&ndash;</td>
<td>The specified reference genome does not correspond with the reference genome found in the MAF.</td>
<td>GRCh37&#34;</td>
<td></td>
</tr>
<tr class="danger">
<td><span class="glyphicon glyphicon-remove" aria-hidden="true"></span><span class="sr-only">Error</span></td>
<td>8</td>
<td>4</td>
<td>Value in column &#39;NCBI_Build&#39; is invalid</td>
<td>The reference genome in column NCBI_Build does not correspond with the reference genome specified for this study (GRCh37)</td>
<td>GRCh37&#34;</td>
<td></td>
</tr>
Expand Down

0 comments on commit d7b33c5

Please sign in to comment.