Skip to content

Commit

Permalink
Merge pull request #4656 from broadinstitute/codacy-errs
Browse files Browse the repository at this point in the history
Fix outstanding codacy errors
  • Loading branch information
hanars authored Feb 18, 2025
2 parents e7f74ef + d545488 commit 80f2037
Show file tree
Hide file tree
Showing 12 changed files with 145 additions and 123 deletions.
2 changes: 1 addition & 1 deletion matchmaker/views/external_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _check_mme_authenticated(self, url):
self.assertEqual(response.status_code, 401)

response = self.client.get(
url, HTTP_ACCEPT='application/vnd.ga4gh.matchmaker.v1.0+json', HTTP_X_AUTH_TOKEN='invalid',
url, HTTP_ACCEPT='application/vnd.ga4gh.matchmaker.v1.0+json', HTTP_X_AUTH_TOKEN='invalid', # nosec
)
self.assertEqual(response.status_code, 401)

Expand Down
1 change: 0 additions & 1 deletion seqr/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from abc import abstractmethod
import uuid
import json
import random
Expand Down
2 changes: 1 addition & 1 deletion seqr/utils/search/add_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def add_new_es_search_samples(request_json, project, user, notify=False, expecte
request_json['mappingFilePath'], user) if request_json.get('mappingFilePath') else {}
ignore_extra_samples = request_json.get('ignoreExtraSamplesInCallset')
sample_project_tuples = [(sample_id, project.name) for sample_id in sample_ids]
updated_samples, new_samples, inactivated_sample_guids, num_skipped, updated_family_guids = match_and_update_search_samples(
updated_samples, new_samples, inactivated_sample_guids, updated_family_guids = match_and_update_search_samples(
projects=[project],
user=user,
sample_project_tuples=sample_project_tuples,
Expand Down
2 changes: 1 addition & 1 deletion seqr/views/apis/data_manager_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from seqr.models import Sample, RnaSample, Individual, Project, PhenotypePrioritization

from settings import KIBANA_SERVER, KIBANA_ELASTICSEARCH_PASSWORD, KIBANA_ELASTICSEARCH_USER, \
SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL, BASE_URL, LOADING_DATASETS_DIR, PIPELINE_RUNNER_SERVER, \
SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL, LOADING_DATASETS_DIR, PIPELINE_RUNNER_SERVER, \
LUIGI_UI_SERVICE_HOSTNAME, LUIGI_UI_SERVICE_PORT

logger = SeqrLogger(__name__)
Expand Down
48 changes: 25 additions & 23 deletions seqr/views/apis/family_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from django.contrib.postgres.aggregates import ArrayAgg
from django.db.models import Count, Max, Q
from django.db.models.fields.files import ImageFieldFile
from django.db.models.functions import JSONObject, Concat, Upper, Substr

from matchmaker.models import MatchmakerSubmission
from reference_data.models import Omim
Expand All @@ -22,7 +21,7 @@
from seqr.views.utils.project_context_utils import add_families_context, families_discovery_tags, add_project_tag_types, \
MME_TAG_NAME
from seqr.models import Family, FamilyAnalysedBy, Individual, FamilyNote, Sample, VariantTag, AnalysisGroup, RnaSeqTpm, \
PhenotypePrioritization, Project, RnaSeqOutlier, RnaSeqSpliceOutlier, RnaSample
PhenotypePrioritization, Project, RnaSample
from seqr.views.utils.permissions_utils import check_project_permissions, get_project_and_check_pm_permissions, \
login_and_policies_required, user_is_analyst, has_case_review_permissions, external_anvil_project_can_edit
from seqr.views.utils.variant_utils import get_phenotype_prioritization, get_omim_intervals_query, DISCOVERY_CATEGORY
Expand Down Expand Up @@ -391,27 +390,7 @@ def receive_families_table_handler(request, project_guid):
project = get_project_and_check_pm_permissions(project_guid, request.user)

def _process_records(records, filename=''):
column_map = {}
for i, field in enumerate(records[0]):
key = field.lower()
if 'family' in key:
if 'prev' in key:
column_map[PREVIOUS_FAMILY_ID_FIELD] = i
else:
column_map[FAMILY_ID_FIELD] = i
elif 'display' in key:
column_map['displayName'] = i
elif 'phenotype' in key:
column_map['codedPhenotype'] = i
elif 'mondo' in key and 'id' in key:
column_map['mondoId'] = i
elif 'description' in key:
column_map['description'] = i
elif 'external' in key and 'data' in key:
column_map['externalData'] = i
if FAMILY_ID_FIELD not in column_map:
raise ValueError('Invalid header, missing family id column')

column_map = _get_family_column_map(records[0])
parsed_records = [{column: PARSE_FAMILY_TABLE_FIELDS.get(column, lambda v: v)(row[index])
for column, index in column_map.items()} for row in records[1:]]
family_ids = [r.get(PREVIOUS_FAMILY_ID_FIELD) or r[FAMILY_ID_FIELD] for r in parsed_records]
Expand Down Expand Up @@ -452,6 +431,29 @@ def _process_records(records, filename=''):
'info': info,
})

def _get_family_column_map(record):
    """Map recognized family-table columns in a header row to their indices.

    Matching is case-insensitive and substring-based: a header containing
    'family' maps to the family id column ('prev' selects the previous-id
    column), otherwise the first matching keyword rule wins.

    Args:
        record: the header row — an iterable of column-name strings.

    Returns:
        dict mapping canonical column names to their index in the row.

    Raises:
        ValueError: if no family id column is present in the header.
    """
    # Keyword rules checked in order for non-family headers; each entry is
    # (required substrings, canonical column name). First match wins.
    keyword_rules = (
        (('display',), 'displayName'),
        (('phenotype',), 'codedPhenotype'),
        (('mondo', 'id'), 'mondoId'),
        (('description',), 'description'),
        (('external', 'data'), 'externalData'),
    )
    column_map = {}
    for index, header in enumerate(record):
        key = header.lower()
        # 'family' takes priority over every other rule (e.g. a header like
        # "family description" is treated as the family id column).
        if 'family' in key:
            target = PREVIOUS_FAMILY_ID_FIELD if 'prev' in key else FAMILY_ID_FIELD
            column_map[target] = index
            continue
        for substrings, column in keyword_rules:
            if all(part in key for part in substrings):
                column_map[column] = index
                break
    if FAMILY_ID_FIELD not in column_map:
        raise ValueError('Invalid header, missing family id column')
    return column_map

@login_and_policies_required
def create_family_note(request, family_guid):
family = Family.objects.get(guid=family_guid)
Expand Down
2 changes: 1 addition & 1 deletion seqr/views/apis/igv_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import requests

from django.core.exceptions import PermissionDenied
from django.http import StreamingHttpResponse, HttpResponse
from django.http import StreamingHttpResponse

from seqr.models import Individual, IgvSample
from seqr.utils.file_utils import file_iter, does_file_exist, is_google_bucket_file_path, run_command, get_google_project
Expand Down
2 changes: 1 addition & 1 deletion seqr/views/apis/project_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from seqr.views.utils.orm_to_json_utils import _get_json_for_project, get_json_for_samples, \
get_json_for_project_collaborator_list, get_json_for_matchmaker_submissions, \
get_json_for_family_notes, _get_json_for_individuals, get_json_for_project_collaborator_groups, \
FAMILY_ADDITIONAL_VALUES, INDIVIDUAL_GUIDS_VALUES
FAMILY_ADDITIONAL_VALUES
from seqr.views.utils.permissions_utils import get_project_and_check_permissions, check_project_permissions, \
check_user_created_object_permissions, pm_required, user_is_pm, login_and_policies_required, \
has_workspace_perm, has_case_review_permissions, is_internal_anvil_project
Expand Down
57 changes: 34 additions & 23 deletions seqr/views/apis/report_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,30 +422,14 @@ def _add_row(row, family_id, row_type):
missing_airtable_data_types = defaultdict(list)
missing_seqr_data_types = defaultdict(list)
for participant in participant_rows:
phenotype_rows += _parse_participant_phenotype_rows(participant)
analyte = {k: participant.pop(k) for k in [SMID_FIELD, *ANALYTE_TABLE_COLUMNS[2:]]}
analyte['participant_id'] = participant['participant_id']

if not participant[PARTICIPANT_ID_FIELD]:
missing_participant_ids.append(participant['participant_id'])
continue

airtable_participant_id = participant.pop(PARTICIPANT_ID_FIELD)
airtable_metadata = airtable_metadata_by_participant.get(airtable_participant_id)
if not airtable_metadata:
missing_airtable.append(airtable_participant_id)
continue

seqr_data_types = set(grouped_data_type_individuals[participant['participant_id']].keys())
airtable_data_types = {dt.upper() for dt in GREGOR_DATA_TYPES if dt.upper() in airtable_metadata}
for data_type in seqr_data_types - airtable_data_types:
missing_airtable_data_types[data_type].append(airtable_participant_id)
for data_type in airtable_data_types - seqr_data_types:
missing_seqr_data_types[data_type].append(airtable_participant_id)
_parse_participant_airtable_rows(
analyte, airtable_metadata, seqr_data_types.intersection(airtable_data_types), experiment_ids_by_participant,
analyte_rows, airtable_rows, experiment_lookup_rows,
airtable_args = _process_participant_row(
participant, phenotype_rows, missing_participant_ids, airtable_metadata_by_participant,
missing_airtable, grouped_data_type_individuals, missing_airtable_data_types, missing_seqr_data_types,
)
if airtable_args:
_parse_participant_airtable_rows(
*airtable_args, experiment_ids_by_participant, analyte_rows, airtable_rows, experiment_lookup_rows,
)

errors = []
if missing_participant_ids:
Expand Down Expand Up @@ -497,6 +481,33 @@ def _add_row(row, family_id, row_type):
})


def _process_participant_row(participant, phenotype_rows, missing_participant_ids, airtable_metadata_by_participant,
                             missing_airtable, grouped_data_type_individuals, missing_airtable_data_types,
                             missing_seqr_data_types):
    """Validate a single participant row and extract its analyte/airtable data.

    Mutates the passed-in accumulators in place: extends phenotype_rows, and
    records any missing-data problems in the missing_* collections. Also pops
    the analyte and airtable-id fields out of ``participant``.

    Returns:
        (analyte dict, airtable metadata, set of data types present in both
        seqr and airtable), or None when the row cannot be matched (missing
        participant id or no airtable metadata found).
    """
    # Phenotype rows are collected even for rows that fail the checks below.
    phenotype_rows += _parse_participant_phenotype_rows(participant)

    seqr_participant_id = participant['participant_id']
    analyte = {key: participant.pop(key) for key in [SMID_FIELD, *ANALYTE_TABLE_COLUMNS[2:]]}
    analyte['participant_id'] = seqr_participant_id

    if not participant[PARTICIPANT_ID_FIELD]:
        missing_participant_ids.append(seqr_participant_id)
        return None

    airtable_id = participant.pop(PARTICIPANT_ID_FIELD)
    metadata = airtable_metadata_by_participant.get(airtable_id)
    if not metadata:
        missing_airtable.append(airtable_id)
        return None

    # Flag data types present on one side but not the other.
    seqr_types = set(grouped_data_type_individuals[seqr_participant_id].keys())
    airtable_types = {data_type.upper() for data_type in GREGOR_DATA_TYPES if data_type.upper() in metadata}
    for data_type in seqr_types - airtable_types:
        missing_airtable_data_types[data_type].append(airtable_id)
    for data_type in airtable_types - seqr_types:
        missing_seqr_data_types[data_type].append(airtable_id)

    return analyte, metadata, seqr_types & airtable_types


def _get_individual_data_types(projects):
sample_types = Sample.objects.filter(individual__family__project__in=projects).values_list('individual_id', 'sample_type')
individual_data_types = defaultdict(set)
Expand Down
1 change: 0 additions & 1 deletion seqr/views/apis/summary_data_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from django.db.models import CharField, F, Value
from django.db.models.functions import Coalesce, Concat, JSONObject, NullIf
import json
from random import randint

from matchmaker.matchmaker_utils import get_mme_gene_phenotype_ids_for_submissions, parse_mme_features, \
get_mme_metrics, get_hpo_terms_by_id
Expand Down
100 changes: 54 additions & 46 deletions seqr/views/utils/dataset_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import defaultdict
from django.contrib.postgres.aggregates import ArrayAgg
from django.db.models import Count, F, Q
from django.db.models import F, Q
from django.utils import timezone
from tqdm import tqdm

Expand Down Expand Up @@ -102,7 +102,7 @@ def _find_or_create_samples(
)
samples_guids += [s.guid for s in new_sample_models]

return samples_guids, individual_ids, remaining_sample_keys, loaded_date
return samples_guids, individual_ids, loaded_date


def _create_samples(sample_data, user, loaded_date=timezone.now(), **kwargs):
Expand Down Expand Up @@ -196,7 +196,7 @@ def match_and_update_search_samples(
projects, sample_project_tuples, sample_type, dataset_type, sample_data, user, expected_families=None,
sample_id_to_individual_id_mapping=None, raise_unmatched_error_template='Matches not found for sample ids: {sample_ids}',
):
samples_guids, individual_ids, remaining_sample_keys, loaded_date = _find_or_create_samples(
samples_guids, individual_ids, loaded_date = _find_or_create_samples(
sample_project_tuples=sample_project_tuples,
projects=projects,
user=user,
Expand Down Expand Up @@ -225,7 +225,7 @@ def match_and_update_search_samples(
previous_loaded_individuals = set(Sample.objects.filter(guid__in=inactivated_sample_guids).values_list('individual_id', flat=True))
new_samples = dict(updated_samples.exclude(individual_id__in=previous_loaded_individuals).values_list('id', 'sample_id'))

return updated_samples, new_samples, inactivated_sample_guids, len(remaining_sample_keys), family_guids_to_update
return updated_samples, new_samples, inactivated_sample_guids, family_guids_to_update


def _parse_tsv_row(row):
Expand Down Expand Up @@ -360,48 +360,11 @@ def _load_rna_seq_file(
missing_required_fields = defaultdict(set)
gene_ids = set()
for line in tqdm(parsed_f, unit=' rows'):
row = dict(zip(header, line))

row_dict = {mapped_key: row[col] for mapped_key, col in column_map.items()}

missing_cols = {col_id for col, col_id in required_column_map.items() if not row.get(col)}
sample_id = row_dict.pop(SAMPLE_ID_COL) if SAMPLE_ID_COL in row_dict else row[SAMPLE_ID_COL]
if missing_cols:
for col in missing_cols:
missing_required_fields[col].add(sample_id)
if missing_cols:
continue

if row.get(INDIV_ID_COL) and sample_id not in sample_id_to_individual_id_mapping:
sample_id_to_individual_id_mapping[sample_id] = row[INDIV_ID_COL]

tissue_type = TISSUE_TYPE_MAP[row[TISSUE_COL]]
project = row_dict.pop(PROJECT_COL, None) or row[PROJECT_COL]
sample_key = ((sample_id_to_individual_id_mapping or {}).get(sample_id, sample_id), project, tissue_type)

potential_sample = potential_samples.get(sample_key)
if (potential_sample or {}).get('active'):
loaded_samples.add(potential_sample['guid'])
continue

row_gene_ids = row_dict[GENE_ID_COL].split(';')
if any(row_gene_ids):
gene_ids.update(row_gene_ids)

if potential_sample:
sample_guid_keys_to_load[potential_sample['guid']] = sample_key
else:
_match_new_sample(
sample_key, samples_to_create, unmatched_samples, individual_data_by_key,
)

if missing_required_fields or (unmatched_samples and not ignore_extra_samples) or (sample_key in unmatched_samples):
# If there are definite errors, do not process/save data, just continue to check for additional errors
continue

for gene_id in row_gene_ids:
row_dict = {**row_dict, GENE_ID_COL: gene_id}
save_data(sample_key, row_dict)
_parse_rna_row(
dict(zip(header, line)), column_map, required_column_map, missing_required_fields,
sample_id_to_individual_id_mapping, potential_samples, loaded_samples, gene_ids, sample_guid_keys_to_load,
samples_to_create, unmatched_samples, individual_data_by_key, save_data, ignore_extra_samples,
)

errors, warnings = _process_rna_errors(
gene_ids, missing_required_fields, unmatched_samples, ignore_extra_samples, loaded_samples,
Expand All @@ -418,6 +381,51 @@ def _load_rna_seq_file(
return warnings, len(loaded_samples) + len(unmatched_samples), sample_guid_keys_to_load, prev_loaded_individual_ids


def _parse_rna_row(row, column_map, required_column_map, missing_required_fields, sample_id_to_individual_id_mapping,
                   potential_samples, loaded_samples, gene_ids, sample_guid_keys_to_load, samples_to_create,
                   unmatched_samples, individual_data_by_key, save_data, ignore_extra_samples):
    """Parse and validate one RNA-seq data row, saving its per-gene records.

    Mutates the passed-in accumulators in place (missing_required_fields,
    sample_id_to_individual_id_mapping, loaded_samples, gene_ids,
    sample_guid_keys_to_load, samples_to_create, unmatched_samples) and calls
    ``save_data(sample_key, row_dict)`` once per gene id when the row is valid.
    Returns None; a row is silently skipped when it has missing required
    columns, belongs to an already-loaded sample, or when errors have been
    detected so far (processing continues only to collect further errors).
    """
    row_dict = {mapped_key: row[col] for mapped_key, col in column_map.items()}

    # Fix: the original checked `if missing_cols:` twice back-to-back;
    # a single guard records the missing fields and skips the row.
    missing_cols = {col_id for col, col_id in required_column_map.items() if not row.get(col)}
    sample_id = row_dict.pop(SAMPLE_ID_COL) if SAMPLE_ID_COL in row_dict else row[SAMPLE_ID_COL]
    if missing_cols:
        for col in missing_cols:
            missing_required_fields[col].add(sample_id)
        return

    # Record any explicit sample -> individual mapping from the file itself.
    if row.get(INDIV_ID_COL) and sample_id not in sample_id_to_individual_id_mapping:
        sample_id_to_individual_id_mapping[sample_id] = row[INDIV_ID_COL]

    tissue_type = TISSUE_TYPE_MAP[row[TISSUE_COL]]
    project = row_dict.pop(PROJECT_COL, None) or row[PROJECT_COL]
    sample_key = ((sample_id_to_individual_id_mapping or {}).get(sample_id, sample_id), project, tissue_type)

    # Skip samples whose data is already actively loaded.
    potential_sample = potential_samples.get(sample_key)
    if (potential_sample or {}).get('active'):
        loaded_samples.add(potential_sample['guid'])
        return

    row_gene_ids = row_dict[GENE_ID_COL].split(';')
    if any(row_gene_ids):
        gene_ids.update(row_gene_ids)

    if potential_sample:
        sample_guid_keys_to_load[potential_sample['guid']] = sample_key
    else:
        _match_new_sample(
            sample_key, samples_to_create, unmatched_samples, individual_data_by_key,
        )

    if missing_required_fields or (unmatched_samples and not ignore_extra_samples) or (sample_key in unmatched_samples):
        # If there are definite errors, do not process/save data, just continue to check for additional errors
        return

    # Fan the row out into one record per gene id.
    for gene_id in row_gene_ids:
        row_dict = {**row_dict, GENE_ID_COL: gene_id}
        save_data(sample_key, row_dict)


def _process_rna_errors(gene_ids, missing_required_fields, unmatched_samples, ignore_extra_samples, loaded_samples):
errors = []
warnings = []
Expand Down
5 changes: 2 additions & 3 deletions seqr/views/utils/orm_to_json_utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
get_json_for_sample, get_json_for_saved_variants, get_json_for_variant_note, get_json_for_locus_list, \
get_json_for_saved_searches, get_json_for_saved_variants_with_tags, get_json_for_current_user
from seqr.views.utils.test_utils import AuthenticationTestCase, AnvilAuthenticationTestCase, \
PROJECT_FIELDS, INTERNAL_FAMILY_FIELDS, \
INDIVIDUAL_FIELDS, INTERNAL_INDIVIDUAL_FIELDS, SAMPLE_FIELDS, SAVED_VARIANT_FIELDS, \
PROJECT_FIELDS, SAMPLE_FIELDS, SAVED_VARIANT_FIELDS, \
FUNCTIONAL_FIELDS, SAVED_SEARCH_FIELDS, LOCUS_LIST_DETAIL_FIELDS, PA_LOCUS_LIST_FIELDS, IGV_SAMPLE_FIELDS, \
CASE_REVIEW_FAMILY_FIELDS, TAG_FIELDS, VARIANT_NOTE_FIELDS, NO_INTERNAL_CASE_REVIEW_INDIVIDUAL_FIELDS
TAG_FIELDS, VARIANT_NOTE_FIELDS

class JSONUtilsTest(object):
databases = '__all__'
Expand Down
Loading

0 comments on commit 80f2037

Please sign in to comment.