diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 71e23b2..eaf304c 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -8,14 +8,14 @@ pool:
vmImage: 'ubuntu-latest'
strategy:
matrix:
- Python38:
- python.version: '3.8'
Python39:
python.version: '3.9'
Python310:
python.version: '3.10'
Python311:
python.version: '3.11'
+ Python312:
+ python.version: '3.12'
steps:
- task: UsePythonVersion@0
inputs:
diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb
index f685cc1..78da83c 100644
--- a/notebooks/1_Generate_data.ipynb
+++ b/notebooks/1_Generate_data.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"is_executing": true
},
@@ -29,9 +29,8 @@
"from pathlib import Path\n",
"from typing import Dict, List\n",
"\n",
- "import numpy as np\n",
"import pandas as pd\n",
- "import tqdm\n",
+ "import numpy as np\n",
"\n",
"from presidio_evaluator import InputSample\n",
"from presidio_evaluator.data_generator import PresidioSentenceFaker"
@@ -70,11 +69,43 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"is_executing": true
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Using default entity providers\n",
+ "Using default entity mapping between the entities in the templates and the ones in the output dataset\n",
+ "Using default provider aliases\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Sampling: 100%|██████████| 10/10 [00:00<00:00, 3959.88it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Please send it to {{STREET_ADDRESS}}\n",
+ "[Span(type: address, value: the corner of Καλαμπάκα 33 and Stefan Land, char_span: [18: 60])]\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
"source": [
"sentence_templates = [\n",
" \"My name is {{name}}\",\n",
@@ -83,7 +114,9 @@
"]\n",
"\n",
"\n",
- "sentence_faker = PresidioSentenceFaker('en_US', lower_case_ratio=0.05, sentence_templates=sentence_templates)\n",
+ "sentence_faker = PresidioSentenceFaker('en_US', \n",
+ " lower_case_ratio=0.05, \n",
+ " sentence_templates=sentence_templates)\n",
"fake_sentence_results = sentence_faker.generate_new_fake_sentences(10)\n",
"\n",
"# Print the spans of the first sample\n",
@@ -103,7 +136,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {
"is_executing": true,
"scrolled": true
@@ -138,18 +171,242 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Using default entity providers\n",
+ "Using default entity mapping between the entities in the templates and the ones in the output dataset\n",
+ "Using default provider aliases\n"
+ ]
+ }
+ ],
"source": [
"sentence_faker = PresidioSentenceFaker('en_US', lower_case_ratio=0.05)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " number | \n",
+ " gender | \n",
+ " nationality | \n",
+ " prefix | \n",
+ " first_name | \n",
+ " middle_initial | \n",
+ " last_name | \n",
+ " street_name | \n",
+ " city | \n",
+ " state_abbr | \n",
+ " ... | \n",
+ " company | \n",
+ " domain_name | \n",
+ " person | \n",
+ " name | \n",
+ " first_name_female | \n",
+ " first_name_male | \n",
+ " prefix_female | \n",
+ " prefix_male | \n",
+ " last_name_female | \n",
+ " last_name_male | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " female | \n",
+ " Czech | \n",
+ " Mrs. | \n",
+ " Marie | \n",
+ " J | \n",
+ " Hamanová | \n",
+ " P.O. Box 255 | \n",
+ " Kangerlussuaq | \n",
+ " QE | \n",
+ " ... | \n",
+ " Simple Solutions | \n",
+ " MarathonDancing.gl | \n",
+ " Marie Hamanová | \n",
+ " Marie Hamanová | \n",
+ " Marie | \n",
+ " | \n",
+ " Mrs. | \n",
+ " | \n",
+ " Hamanová | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " female | \n",
+ " French | \n",
+ " Ms. | \n",
+ " Patricia | \n",
+ " G | \n",
+ " Desrosiers | \n",
+ " Avenida Noruega 42 | \n",
+ " Vila Real | \n",
+ " VR | \n",
+ " ... | \n",
+ " Formula Gray | \n",
+ " LostMillions.com.pt | \n",
+ " Patricia G. Desrosiers | \n",
+ " Patricia G. Desrosiers | \n",
+ " Patricia | \n",
+ " | \n",
+ " Ms. | \n",
+ " | \n",
+ " Desrosiers | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " female | \n",
+ " American | \n",
+ " Ms. | \n",
+ " Debra | \n",
+ " O | \n",
+ " Neal | \n",
+ " 1659 Hoog St | \n",
+ " Brakpan | \n",
+ " GA | \n",
+ " ... | \n",
+ " Dahlkemper's | \n",
+ " MediumTube.co.za | \n",
+ " Debra O. Neal | \n",
+ " Debra O. Neal | \n",
+ " Debra | \n",
+ " | \n",
+ " Ms. | \n",
+ " | \n",
+ " Neal | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " male | \n",
+ " French | \n",
+ " Mr. | \n",
+ " Peverell | \n",
+ " C | \n",
+ " Racine | \n",
+ " 183 Epimenidou Street | \n",
+ " Limassol | \n",
+ " LI | \n",
+ " ... | \n",
+ " Quickbiz | \n",
+ " ImproveLook.com.cy | \n",
+ " Peverell C. Racine | \n",
+ " Peverell C. Racine | \n",
+ " | \n",
+ " Peverell | \n",
+ " | \n",
+ " Mr. | \n",
+ " | \n",
+ " Racine | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " female | \n",
+ " Slovenian | \n",
+ " Mrs. | \n",
+ " Iolanda | \n",
+ " S | \n",
+ " Tratnik | \n",
+ " Karu põik 61 | \n",
+ " Pärnu | \n",
+ " PR | \n",
+ " ... | \n",
+ " Dubrow's Cafeteria | \n",
+ " PostTan.com.ee | \n",
+ " Iolanda S. Tratnik | \n",
+ " Iolanda S. Tratnik | \n",
+ " Iolanda | \n",
+ " | \n",
+ " Mrs. | \n",
+ " | \n",
+ " Tratnik | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 37 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " number gender nationality prefix first_name middle_initial last_name \\\n",
+ "0 1 female Czech Mrs. Marie J Hamanová \n",
+ "1 2 female French Ms. Patricia G Desrosiers \n",
+ "2 3 female American Ms. Debra O Neal \n",
+ "3 4 male French Mr. Peverell C Racine \n",
+ "4 5 female Slovenian Mrs. Iolanda S Tratnik \n",
+ "\n",
+ " street_name city state_abbr ... company \\\n",
+ "0 P.O. Box 255 Kangerlussuaq QE ... Simple Solutions \n",
+ "1 Avenida Noruega 42 Vila Real VR ... Formula Gray \n",
+ "2 1659 Hoog St Brakpan GA ... Dahlkemper's \n",
+ "3 183 Epimenidou Street Limassol LI ... Quickbiz \n",
+ "4 Karu põik 61 Pärnu PR ... Dubrow's Cafeteria \n",
+ "\n",
+ " domain_name person name \\\n",
+ "0 MarathonDancing.gl Marie Hamanová Marie Hamanová \n",
+ "1 LostMillions.com.pt Patricia G. Desrosiers Patricia G. Desrosiers \n",
+ "2 MediumTube.co.za Debra O. Neal Debra O. Neal \n",
+ "3 ImproveLook.com.cy Peverell C. Racine Peverell C. Racine \n",
+ "4 PostTan.com.ee Iolanda S. Tratnik Iolanda S. Tratnik \n",
+ "\n",
+ " first_name_female first_name_male prefix_female prefix_male \\\n",
+ "0 Marie Mrs. \n",
+ "1 Patricia Ms. \n",
+ "2 Debra Ms. \n",
+ "3 Peverell Mr. \n",
+ "4 Iolanda Mrs. \n",
+ "\n",
+ " last_name_female last_name_male \n",
+ "0 Hamanová \n",
+ "1 Desrosiers \n",
+ "2 Neal \n",
+ "3 Racine \n",
+ "4 Tratnik \n",
+ "\n",
+ "[5 rows x 37 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pd.DataFrame(sentence_faker._sentence_faker.records).head()"
]
@@ -159,16 +416,53 @@
"metadata": {},
"source": [
"`PresidioSentenceFaker` adds additional providers by default, which are not included in the Faker package.\n",
- "These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`"
+ "These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`\n",
+ "\n",
+ "It is possible to create providers for additional entity types by extending Faker's `BaseProvider` class, \n",
+ "and calling `add_provider` on the `PresidioSentenceFaker` instance.\n",
+ "For example:"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import random\n",
+ "from faker.providers import BaseProvider\n",
+ "\n",
+ "class MarsIdProvider(BaseProvider):\n",
+ " def mars_id(self):\n",
+ " # Generate a random row number between 1 and 50\n",
+ " row = random.randint(1, 50)\n",
+ " # Generate a random letter for the seat location from A-K\n",
+ " location = random.choice('ABCDEFGHIJK')\n",
+ " # Return the seat in the format \"row-letter\" (e.g., \"25A\")\n",
+ " return f\"{row}{location}\"\n",
+ "\n",
+ "sentence_faker.add_provider(MarsIdProvider)\n",
+ "# Now a new `mars_id` entity can be generated if a template has `mars_id` in it.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
"metadata": {
"is_executing": true
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "presidio_evaluator.data_generator.faker_extensions.providers.ReligionProvider"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"from presidio_evaluator.data_generator.faker_extensions.providers import *\n",
"\n",
@@ -196,17 +490,33 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('name', 'person'),\n",
+ " ('credit_card_number', 'credit_card'),\n",
+ " ('date_of_birth', 'birthday')]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Create entity aliases (e.g. if your provider supports \"name\" but templates contain \"person\").\n",
- "PresidioSentenceFaker.PROVIDER_ALIASES"
+ "provider_aliases = PresidioSentenceFaker.PROVIDER_ALIASES\n",
+ "provider_aliases\n",
+ "\n",
+ "# To customize, call `PresidioSentenceFaker(locale=\"en_US\",...,provider_aliases=provider_aliases)`"
]
},
{
@@ -222,9 +532,33 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 13821.21it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Full text: The title refers to Riddersporen 1 street in STAVANGER. It was on this street that many of the clubs where Metallica first played were situated. \"Battery is found in me\" shows that these early shows on Everardus Mountains Street were important to them. Battery is where \"lunacy finds you\" and you \"smash through the boundaries.\"\n",
+ "Spans: [Span(type: street_name, value: Everardus Mountains, char_span: [202: 221]), Span(type: city, value: STAVANGER, char_span: [45: 54]), Span(type: street_name, value: Riddersporen 1, char_span: [20: 34])]\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
"source": [
"fake_records = sentence_faker.generate_new_fake_sentences(num_samples=number_of_samples)\n",
"pprint.pprint(fake_records[0])"
@@ -239,12 +573,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {
"is_executing": true,
"scrolled": true
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total: 1500\n",
+ "Avg # of records per template: 7.142857142857143\n",
+ "Median # of records per template: 7.0\n",
+ "Std: 2.6812526263406258\n"
+ ]
+ }
+ ],
"source": [
"count_per_template_id = Counter([sample.template_id for sample in fake_records])\n",
"\n",
@@ -267,14 +612,41 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'PERSON': 874,\n",
+ " 'STREET_ADDRESS': 609,\n",
+ " 'GPE': 442,\n",
+ " 'ORGANIZATION': 253,\n",
+ " 'CREDIT_CARD': 131,\n",
+ " 'PHONE_NUMBER': 117,\n",
+ " 'DATE_TIME': 106,\n",
+ " 'TITLE': 91,\n",
+ " 'AGE': 79,\n",
+ " 'NRP': 66,\n",
+ " 'ZIP_CODE': 42,\n",
+ " 'EMAIL_ADDRESS': 33,\n",
+ " 'DOMAIN_NAME': 30,\n",
+ " 'IBAN_CODE': 26,\n",
+ " 'IP_ADDRESS': 18,\n",
+ " 'US_SSN': 18,\n",
+ " 'US_DRIVER_LICENSE': 9})"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"count_per_entity = Counter()\n",
"for record in fake_records:\n",
@@ -285,33 +657,60 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
- "outputs": [],
- "source": [
- "import json\n",
- "import dataclasses\n",
- "def get_json(result) -> str:\n",
- " spans_dict = json.dumps([dataclasses.asdict(span) for span in result.spans])\n",
- " return dict(fake=result.fake, spans=spans_dict, template=result.template, template_id=result.template_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "is_executing": true
- },
- "outputs": [],
- "source": [
- "len(fake_records)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Full text: The title refers to Riddersporen 1 street in STAVANGER. It was on this street that many of the clubs where Metallica first played were situated. \"Battery is found in me\" shows that these early shows on Everardus Mountains Street were important to them. Battery is where \"lunacy finds you\" and you \"smash through the boundaries.\"\n",
+ "Spans: [Span(type: street_name, value: Everardus Mountains, char_span: [202: 221]), Span(type: city, value: STAVANGER, char_span: [45: 54]), Span(type: street_name, value: Riddersporen 1, char_span: [20: 34])]\n",
+ "\n",
+ "Full text: The Ilta T Ryhänen version recorded for Weatherford International Inc became the first celebrity recording by a classical musician to sell one million copies. The song was awarded the seventh gold disc ever granted.\n",
+ "Spans: [Span(type: organization, value: Weatherford International Inc, char_span: [40: 69]), Span(type: person, value: Ilta T Ryhänen, char_span: [4: 18])]\n",
+ "\n",
+ "Full text: We'll meet Monday at JAPAN PULP AND PAPER COMPANY LIMITED, 5931 84 Cassinia Street, GUNDAGAI\n",
+ "Spans: [Span(type: city, value: GUNDAGAI, char_span: [84: 92]), Span(type: street_name, value: 84 Cassinia Street, char_span: [64: 82]), Span(type: building_number, value: 5931, char_span: [59: 63]), Span(type: organization, value: JAPAN PULP AND PAPER COMPANY LIMITED, char_span: [21: 57]), Span(type: day_of_week, value: Monday, char_span: [11: 17])]\n",
+ "\n",
+ "Full text: Can someone call me on 0377 7151585? I have some questions about opening an account.\n",
+ "Spans: [Span(type: phone_number, value: 0377 7151585, char_span: [23: 35])]\n",
+ "\n",
+ "Full text: Leena R Filppula\\nTelephone and Data Systems Inc.\\nServidão Fernando Albrecht 673 Szemere Radial\n",
+ " Suite 538\n",
+ " Joinville\n",
+ " Brazil 27518\\n032 627 37 30 office\\n(07700)331659 fax\\n+41 47 717 21 68 mobile\\n\n",
+ "Spans: [Span(type: phone_number, value: +41 47 717 21 68, char_span: [175: 191]), Span(type: phone_number, value: (07700)331659, char_span: [156: 169]), Span(type: phone_number, value: 032 627 37 30, char_span: [134: 147]), Span(type: address, value: Servidão Fernando Albrecht 673 Szemere Radial\n",
+ " Suite 538\n",
+ " Joinville\n",
+ " Brazil 27518, char_span: [51: 132]), Span(type: organization, value: Telephone and Data Systems Inc., char_span: [18: 49]), Span(type: name, value: Leena R Filppula, char_span: [0: 16])]\n",
+ "\n",
+ "Full text: Bot: Where would you like this to be sent to? User: 11129 Rua Forno 76\n",
+ " Suite 599\n",
+ " Quinta do Passadouro de Cima\n",
+ " Portugal 66984\n",
+ "Spans: [Span(type: address, value: 11129 Rua Forno 76\n",
+ " Suite 599\n",
+ " Quinta do Passadouro de Cima\n",
+ " Portugal 66984, char_span: [52: 127])]\n",
+ "\n",
+ "Full text: One of the most depressing songs on the list. He's injured from the waist down from Spain, but Alexander just has to get laid. Don't go to town, Christopher!\n",
+ "Spans: [Span(type: first_name, value: Christopher, char_span: [145: 156]), Span(type: first_name, value: Alexander, char_span: [95: 104]), Span(type: country, value: Spain, char_span: [84: 89])]\n",
+ "\n",
+ "Full text: Our offices are located at Romina and Müürivahe 27\n",
+ "Spans: [Span(type: address, value: Romina and Müürivahe 27, char_span: [27: 50])]\n",
+ "\n",
+ "Full text: Meet me at Unit 8161 Box 6817\n",
+ "DPO AE 26241\n",
+ "Spans: [Span(type: address, value: Unit 8161 Box 6817\n",
+ "DPO AE 26241, char_span: [11: 42])]\n",
+ "\n",
+ "Full text: How do I open my credit card statement?\n",
+ "Spans: []\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
"for record in fake_records[:10]:\n",
" print(record)"
@@ -330,7 +729,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {
"is_executing": true,
"pycharm": {
@@ -344,9 +743,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'../data/generated_size_1500_date_January_06_2025.json'"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"output_file"
]
@@ -364,30 +774,204 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/1500 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "loading model en_core_web_sm\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 1500/1500 [00:03<00:00, 386.94it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " text | \n",
+ " pos | \n",
+ " tag | \n",
+ " template_id | \n",
+ " label | \n",
+ " sentence | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " The | \n",
+ " DET | \n",
+ " DT | \n",
+ " 110 | \n",
+ " O | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " title | \n",
+ " NOUN | \n",
+ " NN | \n",
+ " 110 | \n",
+ " O | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " refers | \n",
+ " VERB | \n",
+ " VBZ | \n",
+ " 110 | \n",
+ " O | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " to | \n",
+ " ADP | \n",
+ " IN | \n",
+ " 110 | \n",
+ " O | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Riddersporen | \n",
+ " PROPN | \n",
+ " NNP | \n",
+ " 110 | \n",
+ " B-street_name | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 1 | \n",
+ " NUM | \n",
+ " CD | \n",
+ " 110 | \n",
+ " I-street_name | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " street | \n",
+ " NOUN | \n",
+ " NN | \n",
+ " 110 | \n",
+ " O | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " in | \n",
+ " ADP | \n",
+ " IN | \n",
+ " 110 | \n",
+ " O | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " STAVANGER | \n",
+ " PROPN | \n",
+ " NNP | \n",
+ " 110 | \n",
+ " B-city | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " . | \n",
+ " PUNCT | \n",
+ " . | \n",
+ " 110 | \n",
+ " O | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " text pos tag template_id label sentence\n",
+ "0 The DET DT 110 O 0\n",
+ "1 title NOUN NN 110 O 0\n",
+ "2 refers VERB VBZ 110 O 0\n",
+ "3 to ADP IN 110 O 0\n",
+ "4 Riddersporen PROPN NNP 110 B-street_name 0\n",
+ "5 1 NUM CD 110 I-street_name 0\n",
+ "6 street NOUN NN 110 O 0\n",
+ "7 in ADP IN 110 O 0\n",
+ "8 STAVANGER PROPN NNP 110 B-city 0\n",
+ "9 . PUNCT . 110 O 0"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "conll = InputSample.create_conll_dataset(fake_records)"
+ "conll = InputSample.create_conll_dataset(dataset=fake_records)\n",
+ "conll.head(10)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CoNLL2003 dataset structure output location: ../data/generated_size_1500_date_January_06_2025.tsv\n"
+ ]
+ }
+ ],
"source": [
- "conll.to_csv(output_conll, sep=\"\\t\")"
+ "conll.to_csv(output_conll, sep=\"\\t\")\n",
+ "print(f\"CoNLL2003 dataset structure output location: {output_conll}\")"
]
},
{
@@ -396,7 +980,7 @@
"source": [
"### Next steps\n",
"\n",
- "- Evaluate Presidio using this fake data: [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
+ "- Evaluate Presidio using fake data: [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
"- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset: [Sample](3_Split_by_pattern_#.ipynb)\n",
"- Conduct a small exploratory data analysis on the generated data: [Sample](2_PII_EDA.ipynb)"
]
@@ -417,9 +1001,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "presidio",
+ "display_name": "presidio-research",
"language": "python",
- "name": "python3"
+ "name": "presidio_research"
},
"language_info": {
"codemirror_mode": {
@@ -431,7 +1015,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.15"
+ "version": "3.9.6"
}
},
"nbformat": 4,
diff --git a/presidio_evaluator/data_generator/faker_extensions/sentences.py b/presidio_evaluator/data_generator/faker_extensions/sentences.py
index fee59bc..087052c 100644
--- a/presidio_evaluator/data_generator/faker_extensions/sentences.py
+++ b/presidio_evaluator/data_generator/faker_extensions/sentences.py
@@ -195,9 +195,9 @@ def parse(
return fake_pattern
except Exception as err:
raise AttributeError(
- f'Failed to generate fake data based on template "{template}".'
- f"You might need to add a new Faker provider! "
- f"{err}"
+ f'Failed to generate fake data based on template "{template}". '
+ f"Add a new Faker provider or create an alias "
+ f"for the entity name. {err}"
)
@staticmethod
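Reviewer note: the reworded error now names both remedies. A hedged sketch of how it can be triggered and fixed (the `{{employee_id}}` entity is a made-up example with no Faker provider):

```python
from presidio_evaluator.data_generator import PresidioSentenceFaker

faker = PresidioSentenceFaker(
    "en_US",
    lower_case_ratio=0.0,
    sentence_templates=["Badge number: {{employee_id}}"],
)
try:
    faker.generate_new_fake_sentences(1)
except AttributeError as err:
    print(err)  # ...Add a new Faker provider or create an alias for the entity name...

# Remedy: alias an existing Faker provider to the template's entity name.
# Without an entity_type_mapping entry, the new warning path will map the
# span type to EMPLOYEE_ID in the output dataset.
faker.add_provider_alias(provider_name="ssn", new_name="employee_id")
faker.generate_new_fake_sentences(1)
```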
diff --git a/presidio_evaluator/data_generator/presidio_sentence_faker.py b/presidio_evaluator/data_generator/presidio_sentence_faker.py
index 7a85cae..abc3205 100644
--- a/presidio_evaluator/data_generator/presidio_sentence_faker.py
+++ b/presidio_evaluator/data_generator/presidio_sentence_faker.py
@@ -1,7 +1,8 @@
import json
import random
from pathlib import Path
-from typing import List, Optional, Union, Dict
+from typing import List, Optional, Tuple, Union, Dict
+import re
import numpy as np
import pandas as pd
@@ -52,12 +53,17 @@ class PresidioSentenceFaker:
:param: entity_providers: Defaults to presidio_additional_entity_providers, a provided argument overrides this
:param: base_records: A DataFrame with entity types as columns and each row corresponding to a fake individual.
Defaults to presidio_evaluator.data_generator.faker_extensions.datasets.load_fake_person_df()
+ :param: entity_type_mapping: A dictionary mapping entity types to Presidio entity types
+    :param: provider_aliases: A list of (provider_name, alias) tuples, adding each provider under an additional entity name.
+ Useful if the templates contain a different name for the entity type than the one supported by Faker or PresidioSentenceFaker.
:param: random_seed: A seed to make results reproducible between runs
"""
- PROVIDER_ALIASES = dict(
- name="person", credit_card_number="credit_card", date_of_birth="birthday"
- )
+ PROVIDER_ALIASES = [
+ ("name", "person"),
+ ("credit_card_number", "credit_card"),
+ ("date_of_birth", "birthday"),
+ ]
ENTITY_TYPE_MAPPING = dict(
person="PERSON",
ip_address="IP_ADDRESS",
@@ -111,6 +117,8 @@ def __init__(
sentence_templates: Optional[List[str]] = None,
entity_providers: Optional[List[BaseProvider]] = None,
base_records: Optional[Union[pd.DataFrame, List[Dict]]] = None,
+ entity_type_mapping: Optional[Dict[str, str]] = None,
+ provider_aliases: Optional[List[Tuple[str, str]]] = None,
random_seed: Optional[SeedType] = None,
):
self._sentence_templates = sentence_templates
@@ -120,6 +128,7 @@ def __init__(
for line in presidio_templates_file_path.read_text().splitlines()
]
if entity_providers is None:
+ print("Using default entity providers")
entity_providers = presidio_additional_entity_providers
if base_records is None:
base_records = load_fake_person_df()
@@ -131,33 +140,101 @@ def __init__(
self._sentence_faker.add_provider(entity_provider)
self.seed(random_seed)
- for provider, alias in self.PROVIDER_ALIASES.items():
+
+ if not entity_type_mapping:
+ print(
+ "Using default entity mapping between the entities "
+ "in the templates and the ones in the output dataset"
+ )
+ entity_type_mapping = self.ENTITY_TYPE_MAPPING
+
+ self._entity_type_mapping = entity_type_mapping
+
+ if not provider_aliases:
+ print("Using default provider aliases")
+ provider_aliases = self.PROVIDER_ALIASES
+
+ for provider, alias in provider_aliases:
self._sentence_faker.add_provider_alias(
provider_name=provider, new_name=alias
)
self.fake_sentence_results = None
def generate_new_fake_sentences(self, num_samples: int) -> List[InputSample]:
+ """Generate fake sentences based on the templates, input data and entity providers."""
self.fake_sentence_results = []
# Map faker generated entity types to Presidio entity types
for _ in tqdm(range(num_samples), desc="Sampling"):
template_id = random.choice(range(len(self._sentence_templates)))
template = self._sentence_templates[template_id]
+ template = self._preprocess_template(template)
fake_sentence_result = self._sentence_faker.parse(template, template_id)
for span in fake_sentence_result.spans:
- span.type = self.ENTITY_TYPE_MAPPING[span.type]
- for key, value in self.ENTITY_TYPE_MAPPING.items():
+ if span.type in self._entity_type_mapping.keys():
+ # Use the mapped entity type if exists
+ span.type = self._entity_type_mapping[span.type]
+ else:
+ # Otherwise, capitalize the entity type and add to the mapping
+ print(
+ f"Warning: Non-mapped entity type found: {span.type}. "
+ f"Non-mapped entities will be mapped to {span.type.upper()} "
+ f"in the output dataset. If you prefer a different mapping, "
+ f"pass the `entity_type_mapping` argument with a mapping for this entity type."
+ )
+ self._entity_type_mapping[span.type] = span.type.upper()
+ for key, value in self._entity_type_mapping.items():
fake_sentence_result.masked = fake_sentence_result.masked.replace(
"{{%s}}" % key, "{{%s}}" % value
)
self.fake_sentence_results.append(fake_sentence_result)
return self.fake_sentence_results
- def seed(self, seed_value=42):
+ @staticmethod
+ def seed(seed_value=42) -> None:
+ """Seed the faker and random modules for reproducibility."""
Faker.seed(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
+ def add_provider(self, provider: BaseProvider) -> None:
+ """
+ Add a provider to the sentence faker
+ :param provider: A faker provider inheriting from BaseProvider
+ """
+ self._sentence_faker.add_provider(provider)
+
+ def add_provider_alias(self, provider_name: str, new_name: str) -> None:
+ """
+ Adds a copy of a provider, with a different name
+ :param provider_name: Name of original provider
+ :param new_name: New name
+ :example:
+    >>> self.add_provider_alias(provider_name="name", new_name="person")
+    >>> self.person()
+ """
+ self._sentence_faker.add_provider_alias(
+ provider_name=provider_name, new_name=new_name
+ )
+
+ def add_entity_type_mapping(
+ self, input_entity_type: str, output_entity_type: str
+ ) -> None:
+ self._entity_type_mapping[input_entity_type] = output_entity_type
+
+ @staticmethod
+ def _preprocess_template(template: str) -> str:
+ """Lowercase the entity names within double curly braces in the template, and replace < and > with {{ and }}.""" # noqa: E501
+
+ def lowercase_within_braces(s):
+ return re.sub(
+ r"{{(.*?)}}", lambda match: f"{{{{{match.group(1).lower()}}}}}", s
+ )
+
+ template = template.replace("<", "{{").replace(">", "}}")
+ template = lowercase_within_braces(template)
+
+ return template
+
if __name__ == "__main__":
sentence_faker = PresidioSentenceFaker(
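Reviewer note: the new `entity_type_mapping` and `provider_aliases` arguments make the previously hard-coded class constants overridable per instance. A hedged sketch (the `student` entity is invented for illustration):

```python
from presidio_evaluator.data_generator import PresidioSentenceFaker

# PROVIDER_ALIASES is now a list of (provider_name, alias) tuples
aliases = list(PresidioSentenceFaker.PROVIDER_ALIASES)
aliases.append(("name", "student"))  # let {{student}} draw from Faker's name()

# Map the custom template entity onto a Presidio entity type,
# avoiding the non-mapped-entity warning added in this PR
mapping = dict(PresidioSentenceFaker.ENTITY_TYPE_MAPPING)
mapping["student"] = "PERSON"

faker = PresidioSentenceFaker(
    "en_US",
    lower_case_ratio=0.05,
    sentence_templates=["{{student}} handed in the assignment"],
    provider_aliases=aliases,
    entity_type_mapping=mapping,
)
print(faker.generate_new_fake_sentences(3)[0])
```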
diff --git a/presidio_evaluator/data_objects.py b/presidio_evaluator/data_objects.py
index 9e52601..0dd7d2f 100644
--- a/presidio_evaluator/data_objects.py
+++ b/presidio_evaluator/data_objects.py
@@ -179,7 +179,13 @@ def from_json(cls, data, **kwargs):
data["spans"] = [Span.from_json(span) for span in data["spans"]]
return cls(**data, create_tags_from_span=True, **kwargs)
- def get_tags(self, scheme="IOB", model_version="en_core_web_sm"):
+ def get_tags(self, scheme: str = "IOB", model_version: str = "en_core_web_sm"):
+ """Extract the tokens and tags from the spans.
+
+    :param scheme: IO, BIO/IOB or BILUO
+ :param model_version: The name of the spaCy model to use for tokenization
+ """
+
start_indices = [span.start_position for span in self.spans]
end_indices = [span.end_position for span in self.spans]
tags = [span.entity_type for span in self.spans]
@@ -192,19 +198,27 @@ def get_tags(self, scheme="IOB", model_version="en_core_web_sm"):
starts=start_indices,
ends=end_indices,
tokens=tokens,
+ token_model_version=model_version,
)
return tokens, labels
- def to_conll(self, translate_tags: bool) -> List[Dict[str, Any]]:
+ def to_conll(
+ self, translate_tags: bool, tokenizer: str = "en_core_web_sm"
+ ) -> List[Dict[str, Any]]:
"""
Turns a list of InputSample objects to a dictionary
containing text, pos, tag, template_id and label.
:param translate_tags: Whether to translate tags using the PRESIDIO_SPACY_ENTITIES dictionary
+ :param tokenizer: The name of the spaCy model to use for tokenization
:return: Dict
"""
conll = []
+
+ if len(self.tokens) == 0:
+ self.tokens, self.tags = self.get_tags(model_version=tokenizer)
+
for i, token in enumerate(self.tokens):
if translate_tags:
label = self.translate_tag(
@@ -233,7 +247,7 @@ def create_conll_dataset(
dataset: List["InputSample"],
translate_tags=False,
to_bio=True,
- token_model_version="en_core_web_sm",
+ tokenizer: str = "en_core_web_sm",
) -> pd.DataFrame:
if len(dataset) <= 1:
raise ValueError("Dataset should contain multiple records")
@@ -243,7 +257,7 @@ def create_conll_dataset(
for sample in tqdm(dataset):
if to_bio:
sample.biluo_to_bio()
- conll = sample.to_conll(translate_tags=translate_tags)
+ conll = sample.to_conll(translate_tags=translate_tags, tokenizer=tokenizer)
for token in conll:
token["sentence"] = i
conlls.append(token)
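Reviewer note: with the rename to `tokenizer` and the lazy tokenization added to `to_conll`, samples loaded without precomputed tokens can still be exported. A minimal sketch (assuming `fake_records` is a list of `InputSample`, as in the notebook above):

```python
from presidio_evaluator import InputSample

conll = InputSample.create_conll_dataset(
    dataset=fake_records,        # list of InputSample
    translate_tags=False,
    tokenizer="en_core_web_sm",  # spaCy model used when samples lack tokens
)
print(conll.head(10))
```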
diff --git a/pyproject.toml b/pyproject.toml
index 99cd13a..eca721e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "presidio_evaluator"
-version = "0.2.0"
+version = "0.2.1"
description = ""
authors = ["Microsoft"]
readme = "README.md"
diff --git a/tests/test_presidio_sentence_faker.py b/tests/test_presidio_sentence_faker.py
index 206d5fd..34d2d9e 100644
--- a/tests/test_presidio_sentence_faker.py
+++ b/tests/test_presidio_sentence_faker.py
@@ -24,13 +24,13 @@ def test_generate_new_fake_sentences(num_sentences: int):
expected_providers = deepcopy(default_faker_providers)
expected_providers.extend(presidio_providers)
- expected_providers.extend([standard_faker.__getattr__(key)
- for key in PresidioSentenceFaker.PROVIDER_ALIASES.keys()])
+ expected_providers.extend([standard_faker.__getattr__(alias[0])
+ for alias in PresidioSentenceFaker.PROVIDER_ALIASES])
actual_providers = sentence_faker._sentence_faker.providers
num_aliases = len(PresidioSentenceFaker.PROVIDER_ALIASES)
actual_num_providers = len(actual_providers)
- expected_aliases = set(getattr(standard_faker, provider_name)
- for provider_name in PresidioSentenceFaker.PROVIDER_ALIASES.keys())
+ expected_aliases = set(getattr(standard_faker, provider_name[0])
+ for provider_name in PresidioSentenceFaker.PROVIDER_ALIASES)
assert actual_num_providers == len(expected_providers), \
f'Expected {len(presidio_providers)} presidio providers to be used and {num_aliases} aliases. ' \
f'Faker has been extended with {actual_num_providers - len(default_faker_providers)} providers/aliases. ' \
@@ -43,3 +43,14 @@ def test_generate_new_fake_sentences(num_sentences: int):
assert fake_sentence_result.full_text
assert fake_sentence_result.masked
assert fake_sentence_result.template_id >= 0
+
+
+@pytest.mark.parametrize("template_before, template_after", [
+ ("I just moved to {{CiTY}} from {{Country}}",
+ "I just moved to {{city}} from {{country}}"),
+ ("I just moved to from .",
+ "I just moved to {{city}} from {{country}}.")
+])
+def test_preprocess_template(template_before: str, template_after: str):
+ sentence_faker = PresidioSentenceFaker(locale='en', lower_case_ratio=0)
+ assert sentence_faker._preprocess_template(template_before) == template_after