Skip to content

Commit

Permalink
Remove duplicates from MongoDB bulk insertion chunks
Browse files (browse the repository at this point in the history)
  • Loading branch information
Andre Senna committed Jun 12, 2023
1 parent dce63fd commit fff53b6
Showing 1 changed file with 9 additions and 3 deletions.
12 changes: 9 additions & 3 deletions das/canonical_parser.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -93,9 +93,15 @@ def _add_expression(self, expression, composite_type, toplevel, named_type, comp
self._flush_mongo_expressions()

def _mongo_insert_many(self, collection, bulk_insertion_raw):
bulk_insertion = [d for d in bulk_insertion_raw if ("name" not in d) or sys.getsizeof(d["name"]) < 16000000]
if len(bulk_insertion_raw) != len(bulk_insertion):
logger().error(f"Striped {len(bulk_insertion_raw) - len(bulk_insertion)} too large documents")
all_ids = set()
bulk_insertion_no_duplicates = []
for d in bulk_insertion_raw:
if d["_id"] not in all_ids:
all_ids.add(d["_id"])
bulk_insertion_no_duplicates.append(d)
bulk_insertion = [d for d in bulk_insertion_no_duplicates if ("name" not in d) or sys.getsizeof(d["name"]) < 16000000]
if len(bulk_insertion_no_duplicates) != len(bulk_insertion):
logger().error(f"Striped {len(bulk_insertion_no_duplicates) - len(bulk_insertion)} too large documents")
try:
collection.insert_many(bulk_insertion, ordered=False)
except Exception as e:
Expand Down

0 comments on commit fff53b6

Please sign in to comment.