From c1f809c7b3c787b5f885af66a9ce361db89daa4f Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 18:47:42 +0200 Subject: [PATCH 1/9] hotfix for the PresidioSentenceFaker process --- notebooks/1_Generate_data.ipynb | 726 ++++++++++++++++-- .../faker_extensions/sentences.py | 4 +- .../data_generator/presidio_sentence_faker.py | 86 ++- presidio_evaluator/data_objects.py | 19 +- tests/test_presidio_sentence_faker.py | 8 +- 5 files changed, 770 insertions(+), 73 deletions(-) diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb index f685cc1..d5b59e8 100644 --- a/notebooks/1_Generate_data.ipynb +++ b/notebooks/1_Generate_data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "is_executing": true }, @@ -29,9 +29,8 @@ "from pathlib import Path\n", "from typing import Dict, List\n", "\n", - "import numpy as np\n", "import pandas as pd\n", - "import tqdm\n", + "import numpy as np\n", "\n", "from presidio_evaluator import InputSample\n", "from presidio_evaluator.data_generator import PresidioSentenceFaker" @@ -74,7 +73,33 @@ "metadata": { "is_executing": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sampling: 100%|██████████| 10/10 [00:00<00:00, 12706.16it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Non-mapped entity type found: country Non-mapped entities will be mapped to COUNTRY in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n", + "Non-mapped entity type found: name Non-mapped entities will be mapped to NAME in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n", + "Non-mapped entity type found: address Non-mapped entities will be mapped to ADDRESS in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n", + "I just moved to {{city}} from {{COUNTRY}}\n", + "[Span(type: country, value: Italy, char_span: [33: 38]), Span(type: city, value: Gorgoglione, char_span: [16: 27])]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "sentence_templates = [\n", " \"My name is {{name}}\",\n", @@ -103,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "is_executing": true, "scrolled": true @@ -138,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -147,9 +172,223 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbergendernationalityprefixfirst_namemiddle_initiallast_namestreet_namecitystate_abbr...companydomain_namepersonnamefirst_name_femalefirst_name_maleprefix_femaleprefix_malelast_name_femalelast_name_male
01femaleCzechMrs.MarieJHamanováP.O. Box 255KangerlussuaqQE...Simple SolutionsMarathonDancing.glMarie HamanováMarie HamanováMarieMrs.Hamanová
12femaleFrenchMs.PatriciaGDesrosiersAvenida Noruega 42Vila RealVR...Formula GrayLostMillions.com.ptPatricia DesrosiersPatricia DesrosiersPatriciaMs.Desrosiers
23femaleAmericanMs.DebraONeal1659 Hoog StBrakpanGA...Dahlkemper'sMediumTube.co.zaDebra NealDebra NealDebraMs.Neal
34maleFrenchMr.PeverellCRacine183 Epimenidou StreetLimassolLI...QuickbizImproveLook.com.cyPeverell RacinePeverell RacinePeverellMr.Racine
45femaleSlovenianMrs.IolandaSTratnikKaru põik 61PärnuPR...Dubrow's CafeteriaPostTan.com.eeIolanda TratnikIolanda TratnikIolandaMrs.Tratnik
\n", + "

5 rows × 37 columns

\n", + "
" + ], + "text/plain": [ + " number gender nationality prefix first_name middle_initial last_name \\\n", + "0 1 female Czech Mrs. Marie J Hamanová \n", + "1 2 female French Ms. Patricia G Desrosiers \n", + "2 3 female American Ms. Debra O Neal \n", + "3 4 male French Mr. Peverell C Racine \n", + "4 5 female Slovenian Mrs. Iolanda S Tratnik \n", + "\n", + " street_name city state_abbr ... company \\\n", + "0 P.O. Box 255 Kangerlussuaq QE ... Simple Solutions \n", + "1 Avenida Noruega 42 Vila Real VR ... Formula Gray \n", + "2 1659 Hoog St Brakpan GA ... Dahlkemper's \n", + "3 183 Epimenidou Street Limassol LI ... Quickbiz \n", + "4 Karu põik 61 Pärnu PR ... Dubrow's Cafeteria \n", + "\n", + " domain_name person name \\\n", + "0 MarathonDancing.gl Marie Hamanová Marie Hamanová \n", + "1 LostMillions.com.pt Patricia Desrosiers Patricia Desrosiers \n", + "2 MediumTube.co.za Debra Neal Debra Neal \n", + "3 ImproveLook.com.cy Peverell Racine Peverell Racine \n", + "4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n", + "\n", + " first_name_female first_name_male prefix_female prefix_male \\\n", + "0 Marie Mrs. \n", + "1 Patricia Ms. \n", + "2 Debra Ms. \n", + "3 Peverell Mr. \n", + "4 Iolanda Mrs. \n", + "\n", + " last_name_female last_name_male \n", + "0 Hamanová \n", + "1 Desrosiers \n", + "2 Neal \n", + "3 Racine \n", + "4 Tratnik \n", + "\n", + "[5 rows x 37 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.DataFrame(sentence_faker._sentence_faker.records).head()" ] @@ -164,11 +403,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "is_executing": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "presidio_evaluator.data_generator.faker_extensions.providers.ReligionProvider" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from presidio_evaluator.data_generator.faker_extensions.providers import *\n", "\n", @@ -196,17 +446,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "is_executing": true, "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[('name', 'person'),\n", + " ('credit_card_number', 'credit_card'),\n", + " ('date_of_birth', 'birthday')]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Create entity aliases (e.g. if your provider supports \"name\" but templates contain \"person\").\n", - "PresidioSentenceFaker.PROVIDER_ALIASES" + "provider_aliases = PresidioSentenceFaker.PROVIDER_ALIASES\n", + "provider_aliases\n", + "\n", + "# To customize, call `PresidioSentenceFaker(locale=\"en_US\",...,provider_aliases=provider_aliases)`" ] }, { @@ -222,9 +488,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 8521.17it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Full text: The bus station is on Via Pasquale Scura 127\n", + "Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "fake_records = sentence_faker.generate_new_fake_sentences(num_samples=number_of_samples)\n", "pprint.pprint(fake_records[0])" @@ -239,12 +529,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "is_executing": true, "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total: 1500\n", + "Avg # of records per template: 7.142857142857143\n", + "Median # of records per template: 7.0\n", + "Std: 2.4394713378441786\n" + ] + } + ], "source": [ "count_per_template_id = Counter([sample.template_id for sample in fake_records])\n", "\n", @@ -267,14 +568,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "is_executing": true, "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'PERSON': 895,\n", + " 'STREET_ADDRESS': 571,\n", + " 'GPE': 375,\n", + " 'ORGANIZATION': 277,\n", + " 'PHONE_NUMBER': 124,\n", + " 'CREDIT_CARD': 115,\n", + " 'DATE_TIME': 110,\n", + " 'AGE': 77,\n", + " 'TITLE': 71,\n", + " 'NRP': 67,\n", + " 'EMAIL_ADDRESS': 38,\n", + " 'DOMAIN_NAME': 31,\n", + " 'ZIP_CODE': 25,\n", + " 'IP_ADDRESS': 17,\n", + " 'US_SSN': 15,\n", + " 'IBAN_CODE': 12,\n", + " 'US_DRIVER_LICENSE': 4})" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "count_per_entity = Counter()\n", "for record in fake_records:\n", @@ -285,33 +613,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import dataclasses\n", - "def get_json(result) -> str:\n", - " spans_dict = json.dumps([dataclasses.asdict(span) for span in result.spans])\n", - " return dict(fake=result.fake, spans=spans_dict, template=result.template, template_id=result.template_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "is_executing": true - }, - "outputs": [], - "source": [ - "len(fake_records)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Full text: The bus station is on Via Pasquale Scura 127\n", + "Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]\n", + "\n", + "Full text: Leigha C Mackay\\n\\nLa Sagne\\nSwitzerland\n", + "Spans: [Span(type: country, value: Switzerland, char_span: [29: 40]), Span(type: city, value: La Sagne, char_span: [19: 27]), Span(type: name, value: Leigha C Mackay, char_span: [0: 15])]\n", + "\n", + "Full text: Can someone call me on 06-82237745? I have some questions about opening an account.\n", + "Spans: [Span(type: phone_number, value: 06-82237745, char_span: [23: 34])]\n", + "\n", + "Full text: Could you please send me the last billed amount for cc 4218196001337 on my e-mail TomaszJablonski@gustr.com?\n", + "Spans: [Span(type: email, value: TomaszJablonski@gustr.com, char_span: [82: 107]), Span(type: credit_card_number, value: 4218196001337, char_span: [55: 68])]\n", + "\n", + "Full text: Csanád had given Csanád his address: 083 254 Damvergi Street, Nicosia\n", + "Spans: [Span(type: city, value: Nicosia, char_span: [62: 69]), Span(type: street_name, value: 254 Damvergi Street, char_span: [41: 60]), Span(type: building_number, value: 083, char_span: [37: 40]), Span(type: first_name, value: Csanád, char_span: [17: 23]), Span(type: first_name_male, value: Csanád, char_span: [0: 6])]\n", + "\n", + "Full text: You can tell Cecilie was a huge Cecilie K Josefsen fan. Written when he was 21.\n", + "Spans: [Span(type: age, value: 21, char_span: [76: 78]), Span(type: person, value: cecilie k josefsen, char_span: [32: 50]), Span(type: first_name, value: cecilie, char_span: [13: 20])]\n", + "\n", + "Full text: Who's coming to Switzerland with me?\n", + "Spans: [Span(type: country, value: Switzerland, char_span: [16: 27])]\n", + "\n", + "Full text: Helena Carlsen\\n\\n637 Strojírenská 1006\\n Suite 026\\n Svratka\\n Czech Republic 45098\n", + "Spans: [Span(type: postcode, value: 45098, char_span: [79: 84]), Span(type: country, value: Czech Republic, char_span: [64: 78]), Span(type: city, value: Svratka, char_span: [54: 61]), Span(type: secondary_address, value: Suite 026, char_span: [42: 51]), Span(type: street_name, value: Strojírenská 1006, char_span: [22: 39]), Span(type: building_number, value: 637, char_span: [18: 21]), Span(type: person, value: Helena Carlsen, char_span: [0: 14])]\n", + "\n", + "Full text: Francesca Freeman\\n\\n35116 Rua Arapiraca 1943\\n Apt. 559\\n Teixeira de Freitas\\n Brazil 35172\\n(73) 4746-3459-Office\\,781-618-4959-Fax\n", + "Spans: [Span(type: phone_number, value: 781-618-4959, char_span: [118: 130]), Span(type: phone_number, value: (73) 4746-3459, char_span: [95: 109]), Span(type: postcode, value: 35172, char_span: [88: 93]), Span(type: country, value: Brazil, char_span: [81: 87]), Span(type: city, value: Teixeira de Freitas, char_span: [59: 78]), Span(type: secondary_address, value: Apt. 559, char_span: [48: 56]), Span(type: street_name, value: Rua Arapiraca 1943, char_span: [27: 45]), Span(type: building_number, value: 35116, char_span: [21: 26]), Span(type: person, value: Francesca Freeman, char_span: [0: 17])]\n", + "\n", + "Full text: 3... 2... 1... liftoff!\n", + "Spans: []\n", + "\n" + ] + } + ], "source": [ "for record in fake_records[:10]:\n", " print(record)" @@ -330,7 +671,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "is_executing": true, "pycharm": { @@ -344,9 +685,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'../data/generated_size_1500_date_January_06_2025.json'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "output_file" ] @@ -364,30 +716,290 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "is_executing": true, "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1500/1500 [00:00<00:00, 35869.80it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textpostagtemplate_idlabelsentence
0TheDETDT163O0
1busNOUNNN163O0
2stationNOUNNN163O0
3isAUXVBZ163O0
4onADPIN163O0
5ViaPROPNNNP163B-street_name0
6PasqualePROPNNNP163I-street_name0
7ScuraPROPNNNP163I-street_name0
8127NUMCD163I-street_name0
9LeighaVERBVB189B-name1
10CPROPNNNP189I-name1
11Mackay\\n\\nLaNOUNNN189I-name1
12Sagne\\nSwitzerlandPROPNNNP189B-city1
13CanAUXMD57O2
14someonePRONNN57O2
15callVERBVB57O2
16mePRONPRP57O2
17onADPIN57O2
1806NUMCD57B-phone_number2
19-SYMSYM57I-phone_number2
\n", + "
" + ], + "text/plain": [ + " text pos tag template_id label sentence\n", + "0 The DET DT 163 O 0\n", + "1 bus NOUN NN 163 O 0\n", + "2 station NOUN NN 163 O 0\n", + "3 is AUX VBZ 163 O 0\n", + "4 on ADP IN 163 O 0\n", + "5 Via PROPN NNP 163 B-street_name 0\n", + "6 Pasquale PROPN NNP 163 I-street_name 0\n", + "7 Scura PROPN NNP 163 I-street_name 0\n", + "8 127 NUM CD 163 I-street_name 0\n", + "9 Leigha VERB VB 189 B-name 1\n", + "10 C PROPN NNP 189 I-name 1\n", + "11 Mackay\\n\\nLa NOUN NN 189 I-name 1\n", + "12 Sagne\\nSwitzerland PROPN NNP 189 B-city 1\n", + "13 Can AUX MD 57 O 2\n", + "14 someone PRON NN 57 O 2\n", + "15 call VERB VB 57 O 2\n", + "16 me PRON PRP 57 O 2\n", + "17 on ADP IN 57 O 2\n", + "18 06 NUM CD 57 B-phone_number 2\n", + "19 - SYM SYM 57 I-phone_number 2" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "conll = InputSample.create_conll_dataset(fake_records)" + "conll = InputSample.create_conll_dataset(dataset=fake_records)\n", + "conll.head(20)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "is_executing": true, "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CoNLL2003 dataset structure output location: ../data/generated_size_1500_date_January_06_2025.tsv\n" + ] + } + ], "source": [ - "conll.to_csv(output_conll, sep=\"\\t\")" + "conll.to_csv(output_conll, sep=\"\\t\")\n", + "print(f\"CoNLL2003 dataset structure output location: {output_conll}\")" ] }, { @@ -417,9 +1029,9 @@ ], "metadata": { "kernelspec": { - "display_name": "presidio", + "display_name": "presidio-research", "language": "python", - "name": "python3" + "name": "presidio_research" }, "language_info": { "codemirror_mode": { @@ -431,7 +1043,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/presidio_evaluator/data_generator/faker_extensions/sentences.py b/presidio_evaluator/data_generator/faker_extensions/sentences.py index fee59bc..63753b2 100644 --- a/presidio_evaluator/data_generator/faker_extensions/sentences.py +++ b/presidio_evaluator/data_generator/faker_extensions/sentences.py @@ -196,7 +196,9 @@ def parse( except Exception as err: raise AttributeError( f'Failed to generate fake data based on template "{template}".' - f"You might need to add a new Faker provider! " + f"You might need to add a new Faker provider " + f"or create an alias (map the entity name to one " + f"of the existing providers)." f"{err}" ) diff --git a/presidio_evaluator/data_generator/presidio_sentence_faker.py b/presidio_evaluator/data_generator/presidio_sentence_faker.py index 7a85cae..ec83332 100644 --- a/presidio_evaluator/data_generator/presidio_sentence_faker.py +++ b/presidio_evaluator/data_generator/presidio_sentence_faker.py @@ -1,7 +1,8 @@ import json import random from pathlib import Path -from typing import List, Optional, Union, Dict +from typing import List, Optional, Tuple, Union, Dict +import re import numpy as np import pandas as pd @@ -52,12 +53,17 @@ class PresidioSentenceFaker: :param: entity_providers: Defaults to presidio_additional_entity_providers, a provided argument overrides this :param: base_records: A DataFrame with entity types as columns and each row corresponding to a fake individual. Defaults to presidio_evaluator.data_generator.faker_extensions.datasets.load_fake_person_df() + :param: entity_type_mapping: A dictionary mapping entity types to Presidio entity types + :param: provider_aliases: A dictionary mapping provider names to the given entity types. + Useful if the templates contain a different name for the entity type than the one supported by Faker or PresidioSentenceFaker. :param: random_seed: A seed to make results reproducible between runs """ - PROVIDER_ALIASES = dict( - name="person", credit_card_number="credit_card", date_of_birth="birthday" - ) + PROVIDER_ALIASES = [ + ("name", "person"), + ("credit_card_number", "credit_card"), + ("date_of_birth", "birthday"), + ] ENTITY_TYPE_MAPPING = dict( person="PERSON", ip_address="IP_ADDRESS", @@ -111,6 +117,8 @@ def __init__( sentence_templates: Optional[List[str]] = None, entity_providers: Optional[List[BaseProvider]] = None, base_records: Optional[Union[pd.DataFrame, List[Dict]]] = None, + entity_type_mapping: Optional[Dict[str, str]] = None, + provider_aliases: Optional[List[Tuple[str, str]]] = None, random_seed: Optional[SeedType] = None, ): self._sentence_templates = sentence_templates @@ -120,6 +128,7 @@ def __init__( for line in presidio_templates_file_path.read_text().splitlines() ] if entity_providers is None: + print("Using default entity providers") entity_providers = presidio_additional_entity_providers if base_records is None: base_records = load_fake_person_df() @@ -131,33 +140,94 @@ def __init__( self._sentence_faker.add_provider(entity_provider) self.seed(random_seed) - for provider, alias in self.PROVIDER_ALIASES.items(): + + if not entity_type_mapping: + print( + "Using default entity mapping between the entities \ + in the templates and the ones in the output dataset" + ) + entity_type_mapping = self.ENTITY_TYPE_MAPPING + + self._entity_type_mapping = entity_type_mapping + + if not provider_aliases: + print("Using default provider aliases") + provider_aliases = self.PROVIDER_ALIASES + + for provider, alias in provider_aliases: self._sentence_faker.add_provider_alias( provider_name=provider, new_name=alias ) self.fake_sentence_results = None def generate_new_fake_sentences(self, num_samples: int) -> List[InputSample]: + """Generate fake sentences based on the templates, input data and entity providers.""" self.fake_sentence_results = [] # Map faker generated entity types to Presidio entity types for _ in tqdm(range(num_samples), desc="Sampling"): template_id = random.choice(range(len(self._sentence_templates))) template = self._sentence_templates[template_id] + template = self._preprocess_template(template) fake_sentence_result = self._sentence_faker.parse(template, template_id) for span in fake_sentence_result.spans: - span.type = self.ENTITY_TYPE_MAPPING[span.type] - for key, value in self.ENTITY_TYPE_MAPPING.items(): + if span.type in self._entity_type_mapping.keys(): + # Use the mapped entity type if exists + span.type = self._entity_type_mapping[span.type] + else: + # Otherwise, capitalize the entity type and add to the mapping + print( + f"Warning: Non-mapped entity type found: {span.type} " + f"Non-mapped entities will be mapped to {span.type.upper()} " + f"in the output dataset. If you prefer a different mapping, " + f"pass the `entity_type_mapping` argument with a mapping for this entity type." + ) + self._entity_type_mapping[span.type] = span.type.upper() + for key, value in self._entity_type_mapping.items(): fake_sentence_result.masked = fake_sentence_result.masked.replace( "{{%s}}" % key, "{{%s}}" % value ) self.fake_sentence_results.append(fake_sentence_result) return self.fake_sentence_results - def seed(self, seed_value=42): + @staticmethod + def seed(self, seed_value=42) -> None: + """Seed the faker and random modules for reproducibility.""" Faker.seed(seed_value) random.seed(seed_value) np.random.seed(seed_value) + def add_provider_alias(self, provider_name: str, new_name: str) -> None: + """ + Adds a copy of a provider, with a different name + :param provider_name: Name of original provider + :param new_name: New name + :example: + >>>self.add_provider_alias(provider_name="name", new_name="person") + >>>self.person() + """ + self._sentence_faker.add_provider_alias( + provider_name=provider_name, new_name=new_name + ) + + def add_entity_type_mapping( + self, input_entity_type: str, output_entity_type: str + ) -> None: + self._entity_type_mapping[input_entity_type] = output_entity_type + + @staticmethod + def _preprocess_template(template: str): + """Lowercase the entity names within double curly braces in the template, and replace < and > with {{ and }}.""" # noqa: E501 + + def lowercase_within_braces(s): + return re.sub( + r"{{(.*?)}}", lambda match: f"{{{{{match.group(1).lower()}}}}}", s + ) + + template = template.replace("<", "{{").replace(">", "}}") + template = lowercase_within_braces(template) + + return template + if __name__ == "__main__": sentence_faker = PresidioSentenceFaker( diff --git a/presidio_evaluator/data_objects.py b/presidio_evaluator/data_objects.py index 9e52601..faaa944 100644 --- a/presidio_evaluator/data_objects.py +++ b/presidio_evaluator/data_objects.py @@ -180,6 +180,12 @@ def from_json(cls, data, **kwargs): return cls(**data, create_tags_from_span=True, **kwargs) def get_tags(self, scheme="IOB", model_version="en_core_web_sm"): + """Extract the tokens and tags from the spans. + + :param scheme: IO, BIO or BILUO + :param model_version: The name of the spaCy model to use for tokenization + """ + start_indices = [span.start_position for span in self.spans] end_indices = [span.end_position for span in self.spans] tags = [span.entity_type for span in self.spans] @@ -192,19 +198,25 @@ def get_tags(self, scheme="IOB", model_version="en_core_web_sm"): starts=start_indices, ends=end_indices, tokens=tokens, + token_model_version=model_version ) return tokens, labels - def to_conll(self, translate_tags: bool) -> List[Dict[str, Any]]: + def to_conll(self, translate_tags: bool, tokenizer: str) -> List[Dict[str, Any]]: """ Turns a list of InputSample objects to a dictionary containing text, pos, tag, template_id and label. :param translate_tags: Whether to translate tags using the PRESIDIO_SPACY_ENTITIES dictionary + :param tokenizer: The name of the spaCy model to use for tokenization :return: Dict """ conll = [] + + if len(self.tokens) == 0: + self.tokens, self.tags = self.get_tags(model_version=tokenizer) + for i, token in enumerate(self.tokens): if translate_tags: label = self.translate_tag( @@ -233,7 +245,7 @@ def create_conll_dataset( dataset: List["InputSample"], translate_tags=False, to_bio=True, - token_model_version="en_core_web_sm", + tokenizer:str="en_core_web_sm", ) -> pd.DataFrame: if len(dataset) <= 1: raise ValueError("Dataset should contain multiple records") @@ -243,7 +255,8 @@ def create_conll_dataset( for sample in tqdm(dataset): if to_bio: sample.biluo_to_bio() - conll = sample.to_conll(translate_tags=translate_tags) + conll = sample.to_conll(translate_tags=translate_tags, + tokenizer=tokenizer) for token in conll: token["sentence"] = i conlls.append(token) diff --git a/tests/test_presidio_sentence_faker.py b/tests/test_presidio_sentence_faker.py index 206d5fd..7eb699b 100644 --- a/tests/test_presidio_sentence_faker.py +++ b/tests/test_presidio_sentence_faker.py @@ -24,13 +24,13 @@ def test_generate_new_fake_sentences(num_sentences: int): expected_providers = deepcopy(default_faker_providers) expected_providers.extend(presidio_providers) - expected_providers.extend([standard_faker.__getattr__(key) - for key in PresidioSentenceFaker.PROVIDER_ALIASES.keys()]) + expected_providers.extend([standard_faker.__getattr__(alias[0]) + for alias in PresidioSentenceFaker.PROVIDER_ALIASES]) actual_providers = sentence_faker._sentence_faker.providers num_aliases = len(PresidioSentenceFaker.PROVIDER_ALIASES) actual_num_providers = len(actual_providers) - expected_aliases = set(getattr(standard_faker, provider_name) - for provider_name in PresidioSentenceFaker.PROVIDER_ALIASES.keys()) + expected_aliases = set(getattr(standard_faker, provider_name[0]) + for provider_name in PresidioSentenceFaker.PROVIDER_ALIASES) assert actual_num_providers == len(expected_providers), \ f'Expected {len(presidio_providers)} presidio providers to be used and {num_aliases} aliases. ' \ f'Faker has been extended with {actual_num_providers - len(default_faker_providers)} providers/aliases. ' \ From aa41f10be324e124f7b6acc383813dca33d11ae4 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 18:48:13 +0200 Subject: [PATCH 2/9] updated package version to 0.2.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 99cd13a..eca721e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "presidio_evaluator" -version = "0.2.0" +version = "0.2.1" description = "" authors = ["Microsoft"] readme = "README.md" From 5443db19d460854aaf86d0539947b3e0223d4473 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 18:51:58 +0200 Subject: [PATCH 3/9] removed CI for py3.8 and added 3.12 --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 71e23b2..eaf304c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -8,14 +8,14 @@ pool: vmImage: 'ubuntu-latest' strategy: matrix: - Python38: - python.version: '3.8' Python39: python.version: '3.9' Python310: python.version: '3.10' Python311: python.version: '3.11' + Python312: + python.version: '3.12' steps: - task: UsePythonVersion@0 inputs: From 3bdede71b157b446c51b9a1716452828784a199d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 22:24:37 +0200 Subject: [PATCH 4/9] small changes to data generation notebook --- notebooks/1_Generate_data.ipynb | 278 +++++++----------- .../data_generator/presidio_sentence_faker.py | 13 +- 2 files changed, 120 insertions(+), 171 deletions(-) diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb index d5b59e8..fe291ff 100644 --- a/notebooks/1_Generate_data.ipynb +++ b/notebooks/1_Generate_data.ipynb @@ -16,12 +16,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "is_executing": true, "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/omri.mendels/Library/Caches/pypoetry/virtualenvs/presidio-evaluator-nCKHFi6i-py3.9/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", + " warnings.warn(\n" + ] + } + ], "source": [ "import datetime\n", "import pprint\n", @@ -69,26 +78,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "is_executing": true }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using default entity providers\n", + "Using default entity mapping between the entities in the templates and the ones in the output dataset\n", + "Using default provider aliases\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "Sampling: 100%|██████████| 10/10 [00:00<00:00, 12706.16it/s]" + "Sampling: 100%|██████████| 10/10 [00:00<00:00, 3407.23it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Non-mapped entity type found: country Non-mapped entities will be mapped to COUNTRY in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n", - "Non-mapped entity type found: name Non-mapped entities will be mapped to NAME in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n", - "Non-mapped entity type found: address Non-mapped entities will be mapped to ADDRESS in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n", - "I just moved to {{city}} from {{COUNTRY}}\n", + "I just moved to {{GPE}} from {{GPE}}\n", "[Span(type: country, value: Italy, char_span: [33: 38]), Span(type: city, value: Gorgoglione, char_span: [16: 27])]\n" ] }, @@ -108,7 +123,9 @@ "]\n", "\n", "\n", - "sentence_faker = PresidioSentenceFaker('en_US', lower_case_ratio=0.05, sentence_templates=sentence_templates)\n", + "sentence_faker = PresidioSentenceFaker('en_US', \n", + " lower_case_ratio=0.05, \n", + " sentence_templates=sentence_templates)\n", "fake_sentence_results = sentence_faker.generate_new_fake_sentences(10)\n", "\n", "# Print the spans of the first sample\n", @@ -128,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": { "is_executing": true, "scrolled": true @@ -163,16 +180,26 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using default entity providers\n", + "Using default entity mapping between the entities in the templates and the ones in the output dataset\n", + "Using default provider aliases\n" + ] + } + ], "source": [ "sentence_faker = PresidioSentenceFaker('en_US', lower_case_ratio=0.05)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -259,8 +286,8 @@ " ...\n", " Formula Gray\n", " LostMillions.com.pt\n", - " Patricia Desrosiers\n", - " Patricia Desrosiers\n", + " Patricia G. Desrosiers\n", + " Patricia G. Desrosiers\n", " Patricia\n", " \n", " Ms.\n", @@ -360,12 +387,12 @@ "3 183 Epimenidou Street Limassol LI ... Quickbiz \n", "4 Karu põik 61 Pärnu PR ... Dubrow's Cafeteria \n", "\n", - " domain_name person name \\\n", - "0 MarathonDancing.gl Marie Hamanová Marie Hamanová \n", - "1 LostMillions.com.pt Patricia Desrosiers Patricia Desrosiers \n", - "2 MediumTube.co.za Debra Neal Debra Neal \n", - "3 ImproveLook.com.cy Peverell Racine Peverell Racine \n", - "4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n", + " domain_name person name \\\n", + "0 MarathonDancing.gl Marie Hamanová Marie Hamanová \n", + "1 LostMillions.com.pt Patricia G. Desrosiers Patricia G. Desrosiers \n", + "2 MediumTube.co.za Debra Neal Debra Neal \n", + "3 ImproveLook.com.cy Peverell Racine Peverell Racine \n", + "4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n", "\n", " first_name_female first_name_male prefix_female prefix_male \\\n", "0 Marie Mrs. \n", @@ -384,7 +411,7 @@ "[5 rows x 37 columns]" ] }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -398,12 +425,38 @@ "metadata": {}, "source": [ "`PresidioSentenceFaker` adds additional providers by default, which are not included in the Faker package.\n", - "These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`" + "These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`\n", + "\n", + "It is possible to create providers for additional entity types by extending Faker's `BaseProvider` class, \n", + "and calling `add_provider` on the `PresidioSentenceFaker` instance.\n", + "For example:" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "from faker.providers import BaseProvider\n", + "\n", + "class MarsIdProvider(BaseProvider):\n", + " def mars_id(self):\n", + " # Generate a random row number between 1 and 50\n", + " row = random.randint(1, 50)\n", + " # Generate a random letter for the seat location from A-K\n", + " location = random.choice('ABCDEFGHIJK')\n", + " # Return the seat in the format \"row-letter\" (e.g., \"25A\")\n", + " return f\"{row}{location}\"\n", + "\n", + "sentence_faker.add_provider(MarsIdProvider)\n", + "# Now a new `mars_id` entity can be generated if a template has `mars_id` in it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "metadata": { "is_executing": true }, @@ -414,7 +467,7 @@ "presidio_evaluator.data_generator.faker_extensions.providers.ReligionProvider" ] }, - "execution_count": 7, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -446,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": { "is_executing": true, "pycharm": { @@ -462,7 +515,7 @@ " ('date_of_birth', 'birthday')]" ] }, - "execution_count": 8, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -488,14 +541,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 8521.17it/s]" + "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 7794.60it/s]" ] }, { @@ -529,7 +582,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "metadata": { "is_executing": true, "scrolled": true @@ -568,7 +621,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": { "is_executing": true, "pycharm": { @@ -598,7 +651,7 @@ " 'US_DRIVER_LICENSE': 4})" ] }, - "execution_count": 11, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -613,7 +666,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -623,8 +676,8 @@ "Full text: The bus station is on Via Pasquale Scura 127\n", "Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]\n", "\n", - "Full text: Leigha C Mackay\\n\\nLa Sagne\\nSwitzerland\n", - "Spans: [Span(type: country, value: Switzerland, char_span: [29: 40]), Span(type: city, value: La Sagne, char_span: [19: 27]), Span(type: name, value: Leigha C Mackay, char_span: [0: 15])]\n", + "Full text: Leigha Mackay\\n\\nLa Sagne\\nSwitzerland\n", + "Spans: [Span(type: country, value: Switzerland, char_span: [27: 38]), Span(type: city, value: La Sagne, char_span: [17: 25]), Span(type: name, value: Leigha Mackay, char_span: [0: 13])]\n", "\n", "Full text: Can someone call me on 06-82237745? I have some questions about opening an account.\n", "Spans: [Span(type: phone_number, value: 06-82237745, char_span: [23: 34])]\n", @@ -635,8 +688,8 @@ "Full text: Csanád had given Csanád his address: 083 254 Damvergi Street, Nicosia\n", "Spans: [Span(type: city, value: Nicosia, char_span: [62: 69]), Span(type: street_name, value: 254 Damvergi Street, char_span: [41: 60]), Span(type: building_number, value: 083, char_span: [37: 40]), Span(type: first_name, value: Csanád, char_span: [17: 23]), Span(type: first_name_male, value: Csanád, char_span: [0: 6])]\n", "\n", - "Full text: You can tell Cecilie was a huge Cecilie K Josefsen fan. Written when he was 21.\n", - "Spans: [Span(type: age, value: 21, char_span: [76: 78]), Span(type: person, value: cecilie k josefsen, char_span: [32: 50]), Span(type: first_name, value: cecilie, char_span: [13: 20])]\n", + "Full text: You can tell Cecilie was a huge Cecilie Josefsen fan. Written when he was 21.\n", + "Spans: [Span(type: age, value: 21, char_span: [74: 76]), Span(type: person, value: cecilie josefsen, char_span: [32: 48]), Span(type: first_name, value: cecilie, char_span: [13: 20])]\n", "\n", "Full text: Who's coming to Switzerland with me?\n", "Spans: [Span(type: country, value: Switzerland, char_span: [16: 27])]\n", @@ -671,7 +724,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 20, "metadata": { "is_executing": true, "pycharm": { @@ -685,20 +738,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'../data/generated_size_1500_date_January_06_2025.json'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "output_file" ] @@ -716,7 +758,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 23, "metadata": { "is_executing": true, "pycharm": { @@ -728,7 +770,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1500/1500 [00:00<00:00, 35869.80it/s]\n" + "100%|██████████| 1500/1500 [00:00<00:00, 47248.41it/s]\n" ] }, { @@ -851,137 +893,37 @@ " B-name\n", " 1\n", " \n", - " \n", - " 10\n", - " C\n", - " PROPN\n", - " NNP\n", - " 189\n", - " I-name\n", - " 1\n", - " \n", - " \n", - " 11\n", - " Mackay\\n\\nLa\n", - " NOUN\n", - " NN\n", - " 189\n", - " I-name\n", - " 1\n", - " \n", - " \n", - " 12\n", - " Sagne\\nSwitzerland\n", - " PROPN\n", - " NNP\n", - " 189\n", - " B-city\n", - " 1\n", - " \n", - " \n", - " 13\n", - " Can\n", - " AUX\n", - " MD\n", - " 57\n", - " O\n", - " 2\n", - " \n", - " \n", - " 14\n", - " someone\n", - " PRON\n", - " NN\n", - " 57\n", - " O\n", - " 2\n", - " \n", - " \n", - " 15\n", - " call\n", - " VERB\n", - " VB\n", - " 57\n", - " O\n", - " 2\n", - " \n", - " \n", - " 16\n", - " me\n", - " PRON\n", - " PRP\n", - " 57\n", - " O\n", - " 2\n", - " \n", - " \n", - " 17\n", - " on\n", - " ADP\n", - " IN\n", - " 57\n", - " O\n", - " 2\n", - " \n", - " \n", - " 18\n", - " 06\n", - " NUM\n", - " CD\n", - " 57\n", - " B-phone_number\n", - " 2\n", - " \n", - " \n", - " 19\n", - " -\n", - " SYM\n", - " SYM\n", - " 57\n", - " I-phone_number\n", - " 2\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " text pos tag template_id label sentence\n", - "0 The DET DT 163 O 0\n", - "1 bus NOUN NN 163 O 0\n", - "2 station NOUN NN 163 O 0\n", - "3 is AUX VBZ 163 O 0\n", - "4 on ADP IN 163 O 0\n", - "5 Via PROPN NNP 163 B-street_name 0\n", - "6 Pasquale PROPN NNP 163 I-street_name 0\n", - "7 Scura PROPN NNP 163 I-street_name 0\n", - "8 127 NUM CD 163 I-street_name 0\n", - "9 Leigha VERB VB 189 B-name 1\n", - "10 C PROPN NNP 189 I-name 1\n", - "11 Mackay\\n\\nLa NOUN NN 189 I-name 1\n", - "12 Sagne\\nSwitzerland PROPN NNP 189 B-city 1\n", - "13 Can AUX MD 57 O 2\n", - "14 someone PRON NN 57 O 2\n", - "15 call VERB VB 57 O 2\n", - "16 me PRON PRP 57 O 2\n", - "17 on ADP IN 57 O 2\n", - "18 06 NUM CD 57 B-phone_number 2\n", - "19 - SYM SYM 57 I-phone_number 2" + " text pos tag template_id label sentence\n", + "0 The DET DT 163 O 0\n", + "1 bus NOUN NN 163 O 0\n", + "2 station NOUN NN 163 O 0\n", + "3 is AUX VBZ 163 O 0\n", + "4 on ADP IN 163 O 0\n", + "5 Via PROPN NNP 163 B-street_name 0\n", + "6 Pasquale PROPN NNP 163 I-street_name 0\n", + "7 Scura PROPN NNP 163 I-street_name 0\n", + "8 127 NUM CD 163 I-street_name 0\n", + "9 Leigha VERB VB 189 B-name 1" ] }, - "execution_count": 17, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "conll = InputSample.create_conll_dataset(dataset=fake_records)\n", - "conll.head(20)" + "conll.head(10)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 24, "metadata": { "is_executing": true, "pycharm": { @@ -1008,7 +950,7 @@ "source": [ "### Next steps\n", "\n", - "- Evaluate Presidio using this fake data: [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n", + "- Evaluate Presidio using fake data: [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n", "- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset: [Sample](3_Split_by_pattern_#.ipynb)\n", "- Conduct a small exploratory data analysis on the generated data: [Sample](2_PII_EDA.ipynb)" ] diff --git a/presidio_evaluator/data_generator/presidio_sentence_faker.py b/presidio_evaluator/data_generator/presidio_sentence_faker.py index ec83332..f5355e0 100644 --- a/presidio_evaluator/data_generator/presidio_sentence_faker.py +++ b/presidio_evaluator/data_generator/presidio_sentence_faker.py @@ -143,8 +143,8 @@ def __init__( if not entity_type_mapping: print( - "Using default entity mapping between the entities \ - in the templates and the ones in the output dataset" + "Using default entity mapping between the entities " + "in the templates and the ones in the output dataset" ) entity_type_mapping = self.ENTITY_TYPE_MAPPING @@ -176,7 +176,7 @@ def generate_new_fake_sentences(self, num_samples: int) -> List[InputSample]: else: # Otherwise, capitalize the entity type and add to the mapping print( - f"Warning: Non-mapped entity type found: {span.type} " + f"Warning: Non-mapped entity type found: {span.type}. " f"Non-mapped entities will be mapped to {span.type.upper()} " f"in the output dataset. If you prefer a different mapping, " f"pass the `entity_type_mapping` argument with a mapping for this entity type." @@ -196,6 +196,13 @@ def seed(self, seed_value=42) -> None: random.seed(seed_value) np.random.seed(seed_value) + def add_provider(self, provider:BaseProvider) ->None: + """ + Add a provider to the sentence faker + :param provider: A faker provider inheriting from BaseProvider + """ + self._sentence_faker.add_provider(provider) + def add_provider_alias(self, provider_name: str, new_name: str) -> None: """ Adds a copy of a provider, with a different name From 21cc2292d36507494c08db1022f87ccc374181ed Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 22:33:08 +0200 Subject: [PATCH 5/9] minor updates --- .../data_generator/faker_extensions/sentences.py | 8 +++----- .../data_generator/presidio_sentence_faker.py | 2 +- presidio_evaluator/data_objects.py | 8 ++++++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/presidio_evaluator/data_generator/faker_extensions/sentences.py b/presidio_evaluator/data_generator/faker_extensions/sentences.py index 63753b2..c490f91 100644 --- a/presidio_evaluator/data_generator/faker_extensions/sentences.py +++ b/presidio_evaluator/data_generator/faker_extensions/sentences.py @@ -195,11 +195,9 @@ def parse( return fake_pattern except Exception as err: raise AttributeError( - f'Failed to generate fake data based on template "{template}".' - f"You might need to add a new Faker provider " - f"or create an alias (map the entity name to one " - f"of the existing providers)." - f"{err}" + f'Failed to generate fake data based on template "{template}". ' + f'Add a new Faker provider or create an alias ' + f'for the entity name. {err}' ) @staticmethod diff --git a/presidio_evaluator/data_generator/presidio_sentence_faker.py b/presidio_evaluator/data_generator/presidio_sentence_faker.py index f5355e0..009cb37 100644 --- a/presidio_evaluator/data_generator/presidio_sentence_faker.py +++ b/presidio_evaluator/data_generator/presidio_sentence_faker.py @@ -190,7 +190,7 @@ def generate_new_fake_sentences(self, num_samples: int) -> List[InputSample]: return self.fake_sentence_results @staticmethod - def seed(self, seed_value=42) -> None: + def seed(seed_value=42) -> None: """Seed the faker and random modules for reproducibility.""" Faker.seed(seed_value) random.seed(seed_value) diff --git a/presidio_evaluator/data_objects.py b/presidio_evaluator/data_objects.py index faaa944..33fb842 100644 --- a/presidio_evaluator/data_objects.py +++ b/presidio_evaluator/data_objects.py @@ -179,7 +179,9 @@ def from_json(cls, data, **kwargs): data["spans"] = [Span.from_json(span) for span in data["spans"]] return cls(**data, create_tags_from_span=True, **kwargs) - def get_tags(self, scheme="IOB", model_version="en_core_web_sm"): + def get_tags(self, + scheme:str="IOB", + model_version:str="en_core_web_sm"): """Extract the tokens and tags from the spans. :param scheme: IO, BIO or BILUO @@ -203,7 +205,9 @@ def get_tags(self, scheme="IOB", model_version="en_core_web_sm"): return tokens, labels - def to_conll(self, translate_tags: bool, tokenizer: str) -> List[Dict[str, Any]]: + def to_conll(self, + translate_tags: bool, + tokenizer: str="en_core_web_sm") -> List[Dict[str, Any]]: """ Turns a list of InputSample objects to a dictionary containing text, pos, tag, template_id and label. From b57cc70f446ab99a1fba726a71e911902f755c1c Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 22:35:32 +0200 Subject: [PATCH 6/9] formatting --- .../faker_extensions/sentences.py | 4 ++-- .../data_generator/presidio_sentence_faker.py | 2 +- presidio_evaluator/data_objects.py | 17 +++++++---------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/presidio_evaluator/data_generator/faker_extensions/sentences.py b/presidio_evaluator/data_generator/faker_extensions/sentences.py index c490f91..087052c 100644 --- a/presidio_evaluator/data_generator/faker_extensions/sentences.py +++ b/presidio_evaluator/data_generator/faker_extensions/sentences.py @@ -196,8 +196,8 @@ def parse( except Exception as err: raise AttributeError( f'Failed to generate fake data based on template "{template}". ' - f'Add a new Faker provider or create an alias ' - f'for the entity name. {err}' + f"Add a new Faker provider or create an alias " + f"for the entity name. {err}" ) @staticmethod diff --git a/presidio_evaluator/data_generator/presidio_sentence_faker.py b/presidio_evaluator/data_generator/presidio_sentence_faker.py index 009cb37..abc3205 100644 --- a/presidio_evaluator/data_generator/presidio_sentence_faker.py +++ b/presidio_evaluator/data_generator/presidio_sentence_faker.py @@ -196,7 +196,7 @@ def seed(seed_value=42) -> None: random.seed(seed_value) np.random.seed(seed_value) - def add_provider(self, provider:BaseProvider) ->None: + def add_provider(self, provider: BaseProvider) -> None: """ Add a provider to the sentence faker :param provider: A faker provider inheriting from BaseProvider diff --git a/presidio_evaluator/data_objects.py b/presidio_evaluator/data_objects.py index 33fb842..0dd7d2f 100644 --- a/presidio_evaluator/data_objects.py +++ b/presidio_evaluator/data_objects.py @@ -179,9 +179,7 @@ def from_json(cls, data, **kwargs): data["spans"] = [Span.from_json(span) for span in data["spans"]] return cls(**data, create_tags_from_span=True, **kwargs) - def get_tags(self, - scheme:str="IOB", - model_version:str="en_core_web_sm"): + def get_tags(self, scheme: str = "IOB", model_version: str = "en_core_web_sm"): """Extract the tokens and tags from the spans. :param scheme: IO, BIO or BILUO @@ -200,14 +198,14 @@ def get_tags(self, starts=start_indices, ends=end_indices, tokens=tokens, - token_model_version=model_version + token_model_version=model_version, ) return tokens, labels - def to_conll(self, - translate_tags: bool, - tokenizer: str="en_core_web_sm") -> List[Dict[str, Any]]: + def to_conll( + self, translate_tags: bool, tokenizer: str = "en_core_web_sm" + ) -> List[Dict[str, Any]]: """ Turns a list of InputSample objects to a dictionary containing text, pos, tag, template_id and label. @@ -249,7 +247,7 @@ def create_conll_dataset( dataset: List["InputSample"], translate_tags=False, to_bio=True, - tokenizer:str="en_core_web_sm", + tokenizer: str = "en_core_web_sm", ) -> pd.DataFrame: if len(dataset) <= 1: raise ValueError("Dataset should contain multiple records") @@ -259,8 +257,7 @@ def create_conll_dataset( for sample in tqdm(dataset): if to_bio: sample.biluo_to_bio() - conll = sample.to_conll(translate_tags=translate_tags, - tokenizer=tokenizer) + conll = sample.to_conll(translate_tags=translate_tags, tokenizer=tokenizer) for token in conll: token["sentence"] = i conlls.append(token) From f18d5f4c38cea6345aa403000527a87772b7d0c1 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 22:36:38 +0200 Subject: [PATCH 7/9] removed warning --- notebooks/1_Generate_data.ipynb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb index fe291ff..d6c7596 100644 --- a/notebooks/1_Generate_data.ipynb +++ b/notebooks/1_Generate_data.ipynb @@ -25,10 +25,7 @@ { "name": "stderr", "output_type": "stream", - "text": [ - "/Users/omri.mendels/Library/Caches/pypoetry/virtualenvs/presidio-evaluator-nCKHFi6i-py3.9/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", - " warnings.warn(\n" - ] + "text": [] } ], "source": [ From d8a7de39150bad00114195c2e9383556a4a7c89b Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 22:44:24 +0200 Subject: [PATCH 8/9] added tests to template preprocessing --- tests/test_presidio_sentence_faker.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_presidio_sentence_faker.py b/tests/test_presidio_sentence_faker.py index 7eb699b..34d2d9e 100644 --- a/tests/test_presidio_sentence_faker.py +++ b/tests/test_presidio_sentence_faker.py @@ -43,3 +43,14 @@ def test_generate_new_fake_sentences(num_sentences: int): assert fake_sentence_result.full_text assert fake_sentence_result.masked assert fake_sentence_result.template_id >= 0 + + +@pytest.mark.parametrize("template_before, template_after", [ + ("I just moved to {{CiTY}} from {{Country}}", + "I just moved to {{city}} from {{country}}"), + ("I just moved to from .", + "I just moved to {{city}} from {{country}}.") +]) +def test_preprocess_template(template_before: str, template_after: str): + sentence_faker = PresidioSentenceFaker(locale='en', lower_case_ratio=0) + assert sentence_faker._preprocess_template(template_before) == template_after From 58f7f28672d731aa53c69c6e22bf8f7300b1593b Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 22:47:27 +0200 Subject: [PATCH 9/9] re-run notebook --- notebooks/1_Generate_data.ipynb | 305 ++++++++++++++++++-------------- 1 file changed, 169 insertions(+), 136 deletions(-) diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb index d6c7596..78da83c 100644 --- a/notebooks/1_Generate_data.ipynb +++ b/notebooks/1_Generate_data.ipynb @@ -16,18 +16,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "is_executing": true, "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [] - } - ], + "outputs": [], "source": [ "import datetime\n", "import pprint\n", @@ -85,7 +79,7 @@ "output_type": "stream", "text": [ "Using default entity providers\n", - "Using default entity mapping between the entities in the templates and the ones in the output dataset\n", + "Using default entity mapping between the entities in the templates and the ones in the output dataset\n", "Using default provider aliases\n" ] }, @@ -93,15 +87,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "Sampling: 100%|██████████| 10/10 [00:00<00:00, 3407.23it/s]" + "Sampling: 100%|██████████| 10/10 [00:00<00:00, 3959.88it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "I just moved to {{GPE}} from {{GPE}}\n", - "[Span(type: country, value: Italy, char_span: [33: 38]), Span(type: city, value: Gorgoglione, char_span: [16: 27])]\n" + "Please send it to {{STREET_ADDRESS}}\n", + "[Span(type: address, value: the corner of Καλαμπάκα 33 and Stefan Land, char_span: [18: 60])]\n" ] }, { @@ -142,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": { "is_executing": true, "scrolled": true @@ -177,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -185,7 +179,7 @@ "output_type": "stream", "text": [ "Using default entity providers\n", - "Using default entity mapping between the entities in the templates and the ones in the output dataset\n", + "Using default entity mapping between the entities in the templates and the ones in the output dataset\n", "Using default provider aliases\n" ] } @@ -196,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -307,8 +301,8 @@ " ...\n", " Dahlkemper's\n", " MediumTube.co.za\n", - " Debra Neal\n", - " Debra Neal\n", + " Debra O. Neal\n", + " Debra O. Neal\n", " Debra\n", " \n", " Ms.\n", @@ -331,8 +325,8 @@ " ...\n", " Quickbiz\n", " ImproveLook.com.cy\n", - " Peverell Racine\n", - " Peverell Racine\n", + " Peverell C. Racine\n", + " Peverell C. Racine\n", " \n", " Peverell\n", " \n", @@ -355,8 +349,8 @@ " ...\n", " Dubrow's Cafeteria\n", " PostTan.com.ee\n", - " Iolanda Tratnik\n", - " Iolanda Tratnik\n", + " Iolanda S. Tratnik\n", + " Iolanda S. Tratnik\n", " Iolanda\n", " \n", " Mrs.\n", @@ -387,9 +381,9 @@ " domain_name person name \\\n", "0 MarathonDancing.gl Marie Hamanová Marie Hamanová \n", "1 LostMillions.com.pt Patricia G. Desrosiers Patricia G. Desrosiers \n", - "2 MediumTube.co.za Debra Neal Debra Neal \n", - "3 ImproveLook.com.cy Peverell Racine Peverell Racine \n", - "4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n", + "2 MediumTube.co.za Debra O. Neal Debra O. Neal \n", + "3 ImproveLook.com.cy Peverell C. Racine Peverell C. Racine \n", + "4 PostTan.com.ee Iolanda S. Tratnik Iolanda S. Tratnik \n", "\n", " first_name_female first_name_male prefix_female prefix_male \\\n", "0 Marie Mrs. \n", @@ -408,7 +402,7 @@ "[5 rows x 37 columns]" ] }, - "execution_count": 10, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -431,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -453,7 +447,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "metadata": { "is_executing": true }, @@ -464,7 +458,7 @@ "presidio_evaluator.data_generator.faker_extensions.providers.ReligionProvider" ] }, - "execution_count": 14, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -496,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "metadata": { "is_executing": true, "pycharm": { @@ -512,7 +506,7 @@ " ('date_of_birth', 'birthday')]" ] }, - "execution_count": 15, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -538,22 +532,22 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 7794.60it/s]" + "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 13821.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Full text: The bus station is on Via Pasquale Scura 127\n", - "Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]\n", + "Full text: The title refers to Riddersporen 1 street in STAVANGER. It was on this street that many of the clubs where Metallica first played were situated. \"Battery is found in me\" shows that these early shows on Everardus Mountains Street were important to them. Battery is where \"lunacy finds you\" and you \"smash through the boundaries.\"\n", + "Spans: [Span(type: street_name, value: Everardus Mountains, char_span: [202: 221]), Span(type: city, value: STAVANGER, char_span: [45: 54]), Span(type: street_name, value: Riddersporen 1, char_span: [20: 34])]\n", "\n" ] }, @@ -579,7 +573,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "metadata": { "is_executing": true, "scrolled": true @@ -592,7 +586,7 @@ "Total: 1500\n", "Avg # of records per template: 7.142857142857143\n", "Median # of records per template: 7.0\n", - "Std: 2.4394713378441786\n" + "Std: 2.6812526263406258\n" ] } ], @@ -618,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 12, "metadata": { "is_executing": true, "pycharm": { @@ -629,26 +623,26 @@ { "data": { "text/plain": [ - "Counter({'PERSON': 895,\n", - " 'STREET_ADDRESS': 571,\n", - " 'GPE': 375,\n", - " 'ORGANIZATION': 277,\n", - " 'PHONE_NUMBER': 124,\n", - " 'CREDIT_CARD': 115,\n", - " 'DATE_TIME': 110,\n", - " 'AGE': 77,\n", - " 'TITLE': 71,\n", - " 'NRP': 67,\n", - " 'EMAIL_ADDRESS': 38,\n", - " 'DOMAIN_NAME': 31,\n", - " 'ZIP_CODE': 25,\n", - " 'IP_ADDRESS': 17,\n", - " 'US_SSN': 15,\n", - " 'IBAN_CODE': 12,\n", - " 'US_DRIVER_LICENSE': 4})" + "Counter({'PERSON': 874,\n", + " 'STREET_ADDRESS': 609,\n", + " 'GPE': 442,\n", + " 'ORGANIZATION': 253,\n", + " 'CREDIT_CARD': 131,\n", + " 'PHONE_NUMBER': 117,\n", + " 'DATE_TIME': 106,\n", + " 'TITLE': 91,\n", + " 'AGE': 79,\n", + " 'NRP': 66,\n", + " 'ZIP_CODE': 42,\n", + " 'EMAIL_ADDRESS': 33,\n", + " 'DOMAIN_NAME': 30,\n", + " 'IBAN_CODE': 26,\n", + " 'IP_ADDRESS': 18,\n", + " 'US_SSN': 18,\n", + " 'US_DRIVER_LICENSE': 9})" ] }, - "execution_count": 18, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -663,41 +657,55 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Full text: The bus station is on Via Pasquale Scura 127\n", - "Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]\n", + "Full text: The title refers to Riddersporen 1 street in STAVANGER. It was on this street that many of the clubs where Metallica first played were situated. \"Battery is found in me\" shows that these early shows on Everardus Mountains Street were important to them. Battery is where \"lunacy finds you\" and you \"smash through the boundaries.\"\n", + "Spans: [Span(type: street_name, value: Everardus Mountains, char_span: [202: 221]), Span(type: city, value: STAVANGER, char_span: [45: 54]), Span(type: street_name, value: Riddersporen 1, char_span: [20: 34])]\n", "\n", - "Full text: Leigha Mackay\\n\\nLa Sagne\\nSwitzerland\n", - "Spans: [Span(type: country, value: Switzerland, char_span: [27: 38]), Span(type: city, value: La Sagne, char_span: [17: 25]), Span(type: name, value: Leigha Mackay, char_span: [0: 13])]\n", + "Full text: The Ilta T Ryhänen version recorded for Weatherford International Inc became the first celebrity recording by a classical musician to sell one million copies. The song was awarded the seventh gold disc ever granted.\n", + "Spans: [Span(type: organization, value: Weatherford International Inc, char_span: [40: 69]), Span(type: person, value: Ilta T Ryhänen, char_span: [4: 18])]\n", "\n", - "Full text: Can someone call me on 06-82237745? I have some questions about opening an account.\n", - "Spans: [Span(type: phone_number, value: 06-82237745, char_span: [23: 34])]\n", + "Full text: We'll meet Monday at JAPAN PULP AND PAPER COMPANY LIMITED, 5931 84 Cassinia Street, GUNDAGAI\n", + "Spans: [Span(type: city, value: GUNDAGAI, char_span: [84: 92]), Span(type: street_name, value: 84 Cassinia Street, char_span: [64: 82]), Span(type: building_number, value: 5931, char_span: [59: 63]), Span(type: organization, value: JAPAN PULP AND PAPER COMPANY LIMITED, char_span: [21: 57]), Span(type: day_of_week, value: Monday, char_span: [11: 17])]\n", "\n", - "Full text: Could you please send me the last billed amount for cc 4218196001337 on my e-mail TomaszJablonski@gustr.com?\n", - "Spans: [Span(type: email, value: TomaszJablonski@gustr.com, char_span: [82: 107]), Span(type: credit_card_number, value: 4218196001337, char_span: [55: 68])]\n", + "Full text: Can someone call me on 0377 7151585? I have some questions about opening an account.\n", + "Spans: [Span(type: phone_number, value: 0377 7151585, char_span: [23: 35])]\n", "\n", - "Full text: Csanád had given Csanád his address: 083 254 Damvergi Street, Nicosia\n", - "Spans: [Span(type: city, value: Nicosia, char_span: [62: 69]), Span(type: street_name, value: 254 Damvergi Street, char_span: [41: 60]), Span(type: building_number, value: 083, char_span: [37: 40]), Span(type: first_name, value: Csanád, char_span: [17: 23]), Span(type: first_name_male, value: Csanád, char_span: [0: 6])]\n", + "Full text: Leena R Filppula\\nTelephone and Data Systems Inc.\\nServidão Fernando Albrecht 673 Szemere Radial\n", + " Suite 538\n", + " Joinville\n", + " Brazil 27518\\n032 627 37 30 office\\n(07700)331659 fax\\n+41 47 717 21 68 mobile\\n\n", + "Spans: [Span(type: phone_number, value: +41 47 717 21 68, char_span: [175: 191]), Span(type: phone_number, value: (07700)331659, char_span: [156: 169]), Span(type: phone_number, value: 032 627 37 30, char_span: [134: 147]), Span(type: address, value: Servidão Fernando Albrecht 673 Szemere Radial\n", + " Suite 538\n", + " Joinville\n", + " Brazil 27518, char_span: [51: 132]), Span(type: organization, value: Telephone and Data Systems Inc., char_span: [18: 49]), Span(type: name, value: Leena R Filppula, char_span: [0: 16])]\n", "\n", - "Full text: You can tell Cecilie was a huge Cecilie Josefsen fan. Written when he was 21.\n", - "Spans: [Span(type: age, value: 21, char_span: [74: 76]), Span(type: person, value: cecilie josefsen, char_span: [32: 48]), Span(type: first_name, value: cecilie, char_span: [13: 20])]\n", + "Full text: Bot: Where would you like this to be sent to? User: 11129 Rua Forno 76\n", + " Suite 599\n", + " Quinta do Passadouro de Cima\n", + " Portugal 66984\n", + "Spans: [Span(type: address, value: 11129 Rua Forno 76\n", + " Suite 599\n", + " Quinta do Passadouro de Cima\n", + " Portugal 66984, char_span: [52: 127])]\n", "\n", - "Full text: Who's coming to Switzerland with me?\n", - "Spans: [Span(type: country, value: Switzerland, char_span: [16: 27])]\n", + "Full text: One of the most depressing songs on the list. He's injured from the waist down from Spain, but Alexander just has to get laid. Don't go to town, Christopher!\n", + "Spans: [Span(type: first_name, value: Christopher, char_span: [145: 156]), Span(type: first_name, value: Alexander, char_span: [95: 104]), Span(type: country, value: Spain, char_span: [84: 89])]\n", "\n", - "Full text: Helena Carlsen\\n\\n637 Strojírenská 1006\\n Suite 026\\n Svratka\\n Czech Republic 45098\n", - "Spans: [Span(type: postcode, value: 45098, char_span: [79: 84]), Span(type: country, value: Czech Republic, char_span: [64: 78]), Span(type: city, value: Svratka, char_span: [54: 61]), Span(type: secondary_address, value: Suite 026, char_span: [42: 51]), Span(type: street_name, value: Strojírenská 1006, char_span: [22: 39]), Span(type: building_number, value: 637, char_span: [18: 21]), Span(type: person, value: Helena Carlsen, char_span: [0: 14])]\n", + "Full text: Our offices are located at Romina and Müürivahe 27\n", + "Spans: [Span(type: address, value: Romina and Müürivahe 27, char_span: [27: 50])]\n", "\n", - "Full text: Francesca Freeman\\n\\n35116 Rua Arapiraca 1943\\n Apt. 559\\n Teixeira de Freitas\\n Brazil 35172\\n(73) 4746-3459-Office\\,781-618-4959-Fax\n", - "Spans: [Span(type: phone_number, value: 781-618-4959, char_span: [118: 130]), Span(type: phone_number, value: (73) 4746-3459, char_span: [95: 109]), Span(type: postcode, value: 35172, char_span: [88: 93]), Span(type: country, value: Brazil, char_span: [81: 87]), Span(type: city, value: Teixeira de Freitas, char_span: [59: 78]), Span(type: secondary_address, value: Apt. 559, char_span: [48: 56]), Span(type: street_name, value: Rua Arapiraca 1943, char_span: [27: 45]), Span(type: building_number, value: 35116, char_span: [21: 26]), Span(type: person, value: Francesca Freeman, char_span: [0: 17])]\n", + "Full text: Meet me at Unit 8161 Box 6817\n", + "DPO AE 26241\n", + "Spans: [Span(type: address, value: Unit 8161 Box 6817\n", + "DPO AE 26241, char_span: [11: 42])]\n", "\n", - "Full text: 3... 2... 1... liftoff!\n", + "Full text: How do I open my credit card statement?\n", "Spans: []\n", "\n" ] @@ -721,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 14, "metadata": { "is_executing": true, "pycharm": { @@ -735,9 +743,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'../data/generated_size_1500_date_January_06_2025.json'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "output_file" ] @@ -755,7 +774,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 16, "metadata": { "is_executing": true, "pycharm": { @@ -767,7 +786,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1500/1500 [00:00<00:00, 47248.41it/s]\n" + " 0%| | 0/1500 [00:00The\n", " DET\n", " DT\n", - " 163\n", + " 110\n", " O\n", " 0\n", " \n", " \n", " 1\n", - " bus\n", + " title\n", " NOUN\n", " NN\n", - " 163\n", + " 110\n", " O\n", " 0\n", " \n", " \n", " 2\n", - " station\n", - " NOUN\n", - " NN\n", - " 163\n", - " O\n", - " 0\n", - " \n", - " \n", - " 3\n", - " is\n", - " AUX\n", + " refers\n", + " VERB\n", " VBZ\n", - " 163\n", + " 110\n", " O\n", " 0\n", " \n", " \n", - " 4\n", - " on\n", + " 3\n", + " to\n", " ADP\n", " IN\n", - " 163\n", + " 110\n", " O\n", " 0\n", " \n", " \n", - " 5\n", - " Via\n", + " 4\n", + " Riddersporen\n", " PROPN\n", " NNP\n", - " 163\n", + " 110\n", " B-street_name\n", " 0\n", " \n", " \n", - " 6\n", - " Pasquale\n", - " PROPN\n", - " NNP\n", - " 163\n", + " 5\n", + " 1\n", + " NUM\n", + " CD\n", + " 110\n", " I-street_name\n", " 0\n", " \n", " \n", + " 6\n", + " street\n", + " NOUN\n", + " NN\n", + " 110\n", + " O\n", + " 0\n", + " \n", + " \n", " 7\n", - " Scura\n", - " PROPN\n", - " NNP\n", - " 163\n", - " I-street_name\n", + " in\n", + " ADP\n", + " IN\n", + " 110\n", + " O\n", " 0\n", " \n", " \n", " 8\n", - " 127\n", - " NUM\n", - " CD\n", - " 163\n", - " I-street_name\n", + " STAVANGER\n", + " PROPN\n", + " NNP\n", + " 110\n", + " B-city\n", " 0\n", " \n", " \n", " 9\n", - " Leigha\n", - " VERB\n", - " VB\n", - " 189\n", - " B-name\n", - " 1\n", + " .\n", + " PUNCT\n", + " .\n", + " 110\n", + " O\n", + " 0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " text pos tag template_id label sentence\n", - "0 The DET DT 163 O 0\n", - "1 bus NOUN NN 163 O 0\n", - "2 station NOUN NN 163 O 0\n", - "3 is AUX VBZ 163 O 0\n", - "4 on ADP IN 163 O 0\n", - "5 Via PROPN NNP 163 B-street_name 0\n", - "6 Pasquale PROPN NNP 163 I-street_name 0\n", - "7 Scura PROPN NNP 163 I-street_name 0\n", - "8 127 NUM CD 163 I-street_name 0\n", - "9 Leigha VERB VB 189 B-name 1" + " text pos tag template_id label sentence\n", + "0 The DET DT 110 O 0\n", + "1 title NOUN NN 110 O 0\n", + "2 refers VERB VBZ 110 O 0\n", + "3 to ADP IN 110 O 0\n", + "4 Riddersporen PROPN NNP 110 B-street_name 0\n", + "5 1 NUM CD 110 I-street_name 0\n", + "6 street NOUN NN 110 O 0\n", + "7 in ADP IN 110 O 0\n", + "8 STAVANGER PROPN NNP 110 B-city 0\n", + "9 . PUNCT . 110 O 0" ] }, - "execution_count": 23, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -920,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 17, "metadata": { "is_executing": true, "pycharm": {