From c1f809c7b3c787b5f885af66a9ce361db89daa4f Mon Sep 17 00:00:00 2001
From: Omri Mendels <omri374@users.noreply.github.com>
Date: Mon, 6 Jan 2025 18:47:42 +0200
Subject: [PATCH 1/9] hotfix for the PresidioSentenceFaker process

---
 notebooks/1_Generate_data.ipynb               | 726 ++++++++++++++++--
 .../faker_extensions/sentences.py             |   4 +-
 .../data_generator/presidio_sentence_faker.py |  86 ++-
 presidio_evaluator/data_objects.py            |  19 +-
 tests/test_presidio_sentence_faker.py         |   8 +-
 5 files changed, 770 insertions(+), 73 deletions(-)
diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb
index f685cc1..d5b59e8 100644
--- a/notebooks/1_Generate_data.ipynb
+++ b/notebooks/1_Generate_data.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {
     "is_executing": true
    },
@@ -29,9 +29,8 @@
     "from pathlib import Path\n",
     "from typing import Dict, List\n",
     "\n",
-    "import numpy as np\n",
     "import pandas as pd\n",
-    "import tqdm\n",
+    "import numpy as np\n",
     "\n",
     "from presidio_evaluator import InputSample\n",
     "from presidio_evaluator.data_generator import PresidioSentenceFaker"
@@ -74,7 +73,33 @@
    "metadata": {
     "is_executing": true
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sampling: 100%|██████████| 10/10 [00:00<00:00, 12706.16it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Non-mapped entity type found: country Non-mapped entities will be mapped to COUNTRY in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n",
+      "Non-mapped entity type found: name Non-mapped entities will be mapped to NAME in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n",
+      "Non-mapped entity type found: address Non-mapped entities will be mapped to ADDRESS in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n",
+      "I just moved to {{city}} from {{COUNTRY}}\n",
+      "[Span(type: country, value: Italy, char_span: [33: 38]), Span(type: city, value: Gorgoglione, char_span: [16: 27])]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "sentence_templates = [\n",
     "    \"My name is {{name}}\",\n",
@@ -103,7 +128,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {
     "is_executing": true,
     "scrolled": true
@@ -138,7 +163,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -147,9 +172,223 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>number</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>nationality</th>\n",
+       "      <th>prefix</th>\n",
+       "      <th>first_name</th>\n",
+       "      <th>middle_initial</th>\n",
+       "      <th>last_name</th>\n",
+       "      <th>street_name</th>\n",
+       "      <th>city</th>\n",
+       "      <th>state_abbr</th>\n",
+       "      <th>...</th>\n",
+       "      <th>company</th>\n",
+       "      <th>domain_name</th>\n",
+       "      <th>person</th>\n",
+       "      <th>name</th>\n",
+       "      <th>first_name_female</th>\n",
+       "      <th>first_name_male</th>\n",
+       "      <th>prefix_female</th>\n",
+       "      <th>prefix_male</th>\n",
+       "      <th>last_name_female</th>\n",
+       "      <th>last_name_male</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>female</td>\n",
+       "      <td>Czech</td>\n",
+       "      <td>Mrs.</td>\n",
+       "      <td>Marie</td>\n",
+       "      <td>J</td>\n",
+       "      <td>Hamanová</td>\n",
+       "      <td>P.O. Box 255</td>\n",
+       "      <td>Kangerlussuaq</td>\n",
+       "      <td>QE</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Simple Solutions</td>\n",
+       "      <td>MarathonDancing.gl</td>\n",
+       "      <td>Marie Hamanová</td>\n",
+       "      <td>Marie Hamanová</td>\n",
+       "      <td>Marie</td>\n",
+       "      <td></td>\n",
+       "      <td>Mrs.</td>\n",
+       "      <td></td>\n",
+       "      <td>Hamanová</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>female</td>\n",
+       "      <td>French</td>\n",
+       "      <td>Ms.</td>\n",
+       "      <td>Patricia</td>\n",
+       "      <td>G</td>\n",
+       "      <td>Desrosiers</td>\n",
+       "      <td>Avenida Noruega 42</td>\n",
+       "      <td>Vila Real</td>\n",
+       "      <td>VR</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Formula Gray</td>\n",
+       "      <td>LostMillions.com.pt</td>\n",
+       "      <td>Patricia Desrosiers</td>\n",
+       "      <td>Patricia Desrosiers</td>\n",
+       "      <td>Patricia</td>\n",
+       "      <td></td>\n",
+       "      <td>Ms.</td>\n",
+       "      <td></td>\n",
+       "      <td>Desrosiers</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>female</td>\n",
+       "      <td>American</td>\n",
+       "      <td>Ms.</td>\n",
+       "      <td>Debra</td>\n",
+       "      <td>O</td>\n",
+       "      <td>Neal</td>\n",
+       "      <td>1659 Hoog St</td>\n",
+       "      <td>Brakpan</td>\n",
+       "      <td>GA</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Dahlkemper's</td>\n",
+       "      <td>MediumTube.co.za</td>\n",
+       "      <td>Debra Neal</td>\n",
+       "      <td>Debra Neal</td>\n",
+       "      <td>Debra</td>\n",
+       "      <td></td>\n",
+       "      <td>Ms.</td>\n",
+       "      <td></td>\n",
+       "      <td>Neal</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>male</td>\n",
+       "      <td>French</td>\n",
+       "      <td>Mr.</td>\n",
+       "      <td>Peverell</td>\n",
+       "      <td>C</td>\n",
+       "      <td>Racine</td>\n",
+       "      <td>183 Epimenidou Street</td>\n",
+       "      <td>Limassol</td>\n",
+       "      <td>LI</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Quickbiz</td>\n",
+       "      <td>ImproveLook.com.cy</td>\n",
+       "      <td>Peverell Racine</td>\n",
+       "      <td>Peverell Racine</td>\n",
+       "      <td></td>\n",
+       "      <td>Peverell</td>\n",
+       "      <td></td>\n",
+       "      <td>Mr.</td>\n",
+       "      <td></td>\n",
+       "      <td>Racine</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>female</td>\n",
+       "      <td>Slovenian</td>\n",
+       "      <td>Mrs.</td>\n",
+       "      <td>Iolanda</td>\n",
+       "      <td>S</td>\n",
+       "      <td>Tratnik</td>\n",
+       "      <td>Karu põik 61</td>\n",
+       "      <td>Pärnu</td>\n",
+       "      <td>PR</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Dubrow's Cafeteria</td>\n",
+       "      <td>PostTan.com.ee</td>\n",
+       "      <td>Iolanda Tratnik</td>\n",
+       "      <td>Iolanda Tratnik</td>\n",
+       "      <td>Iolanda</td>\n",
+       "      <td></td>\n",
+       "      <td>Mrs.</td>\n",
+       "      <td></td>\n",
+       "      <td>Tratnik</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 37 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   number  gender nationality prefix first_name middle_initial   last_name  \\\n",
+       "0       1  female       Czech   Mrs.      Marie              J    Hamanová   \n",
+       "1       2  female      French    Ms.   Patricia              G  Desrosiers   \n",
+       "2       3  female    American    Ms.      Debra              O        Neal   \n",
+       "3       4    male      French    Mr.   Peverell              C      Racine   \n",
+       "4       5  female   Slovenian   Mrs.    Iolanda              S     Tratnik   \n",
+       "\n",
+       "             street_name           city state_abbr  ...             company  \\\n",
+       "0           P.O. Box 255  Kangerlussuaq         QE  ...    Simple Solutions   \n",
+       "1     Avenida Noruega 42      Vila Real         VR  ...        Formula Gray   \n",
+       "2           1659 Hoog St        Brakpan         GA  ...        Dahlkemper's   \n",
+       "3  183 Epimenidou Street       Limassol         LI  ...            Quickbiz   \n",
+       "4           Karu põik 61          Pärnu         PR  ...  Dubrow's Cafeteria   \n",
+       "\n",
+       "           domain_name               person                 name  \\\n",
+       "0   MarathonDancing.gl       Marie Hamanová       Marie Hamanová   \n",
+       "1  LostMillions.com.pt  Patricia Desrosiers  Patricia Desrosiers   \n",
+       "2     MediumTube.co.za           Debra Neal           Debra Neal   \n",
+       "3   ImproveLook.com.cy      Peverell Racine      Peverell Racine   \n",
+       "4       PostTan.com.ee      Iolanda Tratnik      Iolanda Tratnik   \n",
+       "\n",
+       "  first_name_female first_name_male prefix_female prefix_male  \\\n",
+       "0             Marie                          Mrs.               \n",
+       "1          Patricia                           Ms.               \n",
+       "2             Debra                           Ms.               \n",
+       "3                          Peverell                       Mr.   \n",
+       "4           Iolanda                          Mrs.               \n",
+       "\n",
+       "   last_name_female last_name_male  \n",
+       "0          Hamanová                 \n",
+       "1        Desrosiers                 \n",
+       "2              Neal                 \n",
+       "3                           Racine  \n",
+       "4           Tratnik                 \n",
+       "\n",
+       "[5 rows x 37 columns]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "pd.DataFrame(sentence_faker._sentence_faker.records).head()"
    ]
@@ -164,11 +403,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {
     "is_executing": true
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "presidio_evaluator.data_generator.faker_extensions.providers.ReligionProvider"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "from presidio_evaluator.data_generator.faker_extensions.providers import *\n",
     "\n",
@@ -196,17 +446,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {
     "is_executing": true,
     "pycharm": {
      "name": "#%%\n"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('name', 'person'),\n",
+       " ('credit_card_number', 'credit_card'),\n",
+       " ('date_of_birth', 'birthday')]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "# Create entity aliases (e.g. if your provider supports \"name\" but templates contain \"person\").\n",
-    "PresidioSentenceFaker.PROVIDER_ALIASES"
+    "provider_aliases = PresidioSentenceFaker.PROVIDER_ALIASES\n",
+    "provider_aliases\n",
+    "\n",
+    "# To customize, call `PresidioSentenceFaker(locale=\"en_US\",...,provider_aliases=provider_aliases)`"
    ]
   },
   {
@@ -222,9 +488,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 8521.17it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Full text: The bus station is on Via Pasquale Scura 127\n",
+      "Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "fake_records = sentence_faker.generate_new_fake_sentences(num_samples=number_of_samples)\n",
     "pprint.pprint(fake_records[0])"
@@ -239,12 +529,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "metadata": {
     "is_executing": true,
     "scrolled": true
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total: 1500\n",
+      "Avg # of records per template: 7.142857142857143\n",
+      "Median # of records per template: 7.0\n",
+      "Std: 2.4394713378441786\n"
+     ]
+    }
+   ],
    "source": [
     "count_per_template_id = Counter([sample.template_id for sample in fake_records])\n",
     "\n",
@@ -267,14 +568,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "metadata": {
     "is_executing": true,
     "pycharm": {
      "name": "#%%\n"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Counter({'PERSON': 895,\n",
+       "         'STREET_ADDRESS': 571,\n",
+       "         'GPE': 375,\n",
+       "         'ORGANIZATION': 277,\n",
+       "         'PHONE_NUMBER': 124,\n",
+       "         'CREDIT_CARD': 115,\n",
+       "         'DATE_TIME': 110,\n",
+       "         'AGE': 77,\n",
+       "         'TITLE': 71,\n",
+       "         'NRP': 67,\n",
+       "         'EMAIL_ADDRESS': 38,\n",
+       "         'DOMAIN_NAME': 31,\n",
+       "         'ZIP_CODE': 25,\n",
+       "         'IP_ADDRESS': 17,\n",
+       "         'US_SSN': 15,\n",
+       "         'IBAN_CODE': 12,\n",
+       "         'US_DRIVER_LICENSE': 4})"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "count_per_entity = Counter()\n",
     "for record in fake_records:\n",
@@ -285,33 +613,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "import dataclasses\n",
-    "def get_json(result) -> str:\n",
-    "    spans_dict = json.dumps([dataclasses.asdict(span) for span in result.spans])\n",
-    "    return dict(fake=result.fake, spans=spans_dict, template=result.template, template_id=result.template_id)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "is_executing": true
-   },
-   "outputs": [],
-   "source": [
-    "len(fake_records)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Full text: The bus station is on Via Pasquale Scura 127\n",
+      "Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]\n",
+      "\n",
+      "Full text: Leigha C Mackay\\n\\nLa Sagne\\nSwitzerland\n",
+      "Spans: [Span(type: country, value: Switzerland, char_span: [29: 40]), Span(type: city, value: La Sagne, char_span: [19: 27]), Span(type: name, value: Leigha C Mackay, char_span: [0: 15])]\n",
+      "\n",
+      "Full text: Can someone call me on 06-82237745? I have some questions about opening an account.\n",
+      "Spans: [Span(type: phone_number, value: 06-82237745, char_span: [23: 34])]\n",
+      "\n",
+      "Full text: Could you please send me the last billed amount for cc 4218196001337 on my e-mail TomaszJablonski@gustr.com?\n",
+      "Spans: [Span(type: email, value: TomaszJablonski@gustr.com, char_span: [82: 107]), Span(type: credit_card_number, value: 4218196001337, char_span: [55: 68])]\n",
+      "\n",
+      "Full text: Csanád had given Csanád his address: 083 254 Damvergi Street, Nicosia\n",
+      "Spans: [Span(type: city, value: Nicosia, char_span: [62: 69]), Span(type: street_name, value: 254 Damvergi Street, char_span: [41: 60]), Span(type: building_number, value: 083, char_span: [37: 40]), Span(type: first_name, value: Csanád, char_span: [17: 23]), Span(type: first_name_male, value: Csanád, char_span: [0: 6])]\n",
+      "\n",
+      "Full text: You can tell Cecilie was a huge Cecilie K Josefsen fan. Written when he was 21.\n",
+      "Spans: [Span(type: age, value: 21, char_span: [76: 78]), Span(type: person, value: cecilie k josefsen, char_span: [32: 50]), Span(type: first_name, value: cecilie, char_span: [13: 20])]\n",
+      "\n",
+      "Full text: Who's coming to Switzerland with me?\n",
+      "Spans: [Span(type: country, value: Switzerland, char_span: [16: 27])]\n",
+      "\n",
+      "Full text: Helena Carlsen\\n\\n637 Strojírenská 1006\\n Suite 026\\n Svratka\\n Czech Republic 45098\n",
+      "Spans: [Span(type: postcode, value: 45098, char_span: [79: 84]), Span(type: country, value: Czech Republic, char_span: [64: 78]), Span(type: city, value: Svratka, char_span: [54: 61]), Span(type: secondary_address, value: Suite 026, char_span: [42: 51]), Span(type: street_name, value: Strojírenská 1006, char_span: [22: 39]), Span(type: building_number, value: 637, char_span: [18: 21]), Span(type: person, value: Helena Carlsen, char_span: [0: 14])]\n",
+      "\n",
+      "Full text: Francesca Freeman\\n\\n35116 Rua Arapiraca 1943\\n Apt. 559\\n Teixeira de Freitas\\n Brazil 35172\\n(73) 4746-3459-Office\\,781-618-4959-Fax\n",
+      "Spans: [Span(type: phone_number, value: 781-618-4959, char_span: [118: 130]), Span(type: phone_number, value: (73) 4746-3459, char_span: [95: 109]), Span(type: postcode, value: 35172, char_span: [88: 93]), Span(type: country, value: Brazil, char_span: [81: 87]), Span(type: city, value: Teixeira de Freitas, char_span: [59: 78]), Span(type: secondary_address, value: Apt. 559, char_span: [48: 56]), Span(type: street_name, value: Rua Arapiraca 1943, char_span: [27: 45]), Span(type: building_number, value: 35116, char_span: [21: 26]), Span(type: person, value: Francesca Freeman, char_span: [0: 17])]\n",
+      "\n",
+      "Full text: 3... 2... 1... liftoff!\n",
+      "Spans: []\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "for record in fake_records[:10]:\n",
     "    print(record)"
@@ -330,7 +671,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {
     "is_executing": true,
     "pycharm": {
@@ -344,9 +685,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'../data/generated_size_1500_date_January_06_2025.json'"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "output_file"
    ]
@@ -364,30 +716,290 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "metadata": {
     "is_executing": true,
     "pycharm": {
      "name": "#%%\n"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 1500/1500 [00:00<00:00, 35869.80it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>pos</th>\n",
+       "      <th>tag</th>\n",
+       "      <th>template_id</th>\n",
+       "      <th>label</th>\n",
+       "      <th>sentence</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>The</td>\n",
+       "      <td>DET</td>\n",
+       "      <td>DT</td>\n",
+       "      <td>163</td>\n",
+       "      <td>O</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>bus</td>\n",
+       "      <td>NOUN</td>\n",
+       "      <td>NN</td>\n",
+       "      <td>163</td>\n",
+       "      <td>O</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>station</td>\n",
+       "      <td>NOUN</td>\n",
+       "      <td>NN</td>\n",
+       "      <td>163</td>\n",
+       "      <td>O</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>is</td>\n",
+       "      <td>AUX</td>\n",
+       "      <td>VBZ</td>\n",
+       "      <td>163</td>\n",
+       "      <td>O</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>on</td>\n",
+       "      <td>ADP</td>\n",
+       "      <td>IN</td>\n",
+       "      <td>163</td>\n",
+       "      <td>O</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Via</td>\n",
+       "      <td>PROPN</td>\n",
+       "      <td>NNP</td>\n",
+       "      <td>163</td>\n",
+       "      <td>B-street_name</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Pasquale</td>\n",
+       "      <td>PROPN</td>\n",
+       "      <td>NNP</td>\n",
+       "      <td>163</td>\n",
+       "      <td>I-street_name</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Scura</td>\n",
+       "      <td>PROPN</td>\n",
+       "      <td>NNP</td>\n",
+       "      <td>163</td>\n",
+       "      <td>I-street_name</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>127</td>\n",
+       "      <td>NUM</td>\n",
+       "      <td>CD</td>\n",
+       "      <td>163</td>\n",
+       "      <td>I-street_name</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Leigha</td>\n",
+       "      <td>VERB</td>\n",
+       "      <td>VB</td>\n",
+       "      <td>189</td>\n",
+       "      <td>B-name</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>C</td>\n",
+       "      <td>PROPN</td>\n",
+       "      <td>NNP</td>\n",
+       "      <td>189</td>\n",
+       "      <td>I-name</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Mackay\\n\\nLa</td>\n",
+       "      <td>NOUN</td>\n",
+       "      <td>NN</td>\n",
+       "      <td>189</td>\n",
+       "      <td>I-name</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>Sagne\\nSwitzerland</td>\n",
+       "      <td>PROPN</td>\n",
+       "      <td>NNP</td>\n",
+       "      <td>189</td>\n",
+       "      <td>B-city</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Can</td>\n",
+       "      <td>AUX</td>\n",
+       "      <td>MD</td>\n",
+       "      <td>57</td>\n",
+       "      <td>O</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>someone</td>\n",
+       "      <td>PRON</td>\n",
+       "      <td>NN</td>\n",
+       "      <td>57</td>\n",
+       "      <td>O</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>call</td>\n",
+       "      <td>VERB</td>\n",
+       "      <td>VB</td>\n",
+       "      <td>57</td>\n",
+       "      <td>O</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>me</td>\n",
+       "      <td>PRON</td>\n",
+       "      <td>PRP</td>\n",
+       "      <td>57</td>\n",
+       "      <td>O</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>on</td>\n",
+       "      <td>ADP</td>\n",
+       "      <td>IN</td>\n",
+       "      <td>57</td>\n",
+       "      <td>O</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>06</td>\n",
+       "      <td>NUM</td>\n",
+       "      <td>CD</td>\n",
+       "      <td>57</td>\n",
+       "      <td>B-phone_number</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>-</td>\n",
+       "      <td>SYM</td>\n",
+       "      <td>SYM</td>\n",
+       "      <td>57</td>\n",
+       "      <td>I-phone_number</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  text    pos  tag  template_id           label  sentence\n",
+       "0                  The    DET   DT          163               O         0\n",
+       "1                  bus   NOUN   NN          163               O         0\n",
+       "2              station   NOUN   NN          163               O         0\n",
+       "3                   is    AUX  VBZ          163               O         0\n",
+       "4                   on    ADP   IN          163               O         0\n",
+       "5                  Via  PROPN  NNP          163   B-street_name         0\n",
+       "6             Pasquale  PROPN  NNP          163   I-street_name         0\n",
+       "7                Scura  PROPN  NNP          163   I-street_name         0\n",
+       "8                  127    NUM   CD          163   I-street_name         0\n",
+       "9               Leigha   VERB   VB          189          B-name         1\n",
+       "10                   C  PROPN  NNP          189          I-name         1\n",
+       "11        Mackay\\n\\nLa   NOUN   NN          189          I-name         1\n",
+       "12  Sagne\\nSwitzerland  PROPN  NNP          189          B-city         1\n",
+       "13                 Can    AUX   MD           57               O         2\n",
+       "14             someone   PRON   NN           57               O         2\n",
+       "15                call   VERB   VB           57               O         2\n",
+       "16                  me   PRON  PRP           57               O         2\n",
+       "17                  on    ADP   IN           57               O         2\n",
+       "18                  06    NUM   CD           57  B-phone_number         2\n",
+       "19                   -    SYM  SYM           57  I-phone_number         2"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "conll = InputSample.create_conll_dataset(fake_records)"
+    "conll = InputSample.create_conll_dataset(dataset=fake_records)\n",
+    "conll.head(20)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "metadata": {
     "is_executing": true,
     "pycharm": {
      "name": "#%%\n"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CoNLL2003 dataset structure output location: ../data/generated_size_1500_date_January_06_2025.tsv\n"
+     ]
+    }
+   ],
    "source": [
-    "conll.to_csv(output_conll, sep=\"\\t\")"
+    "conll.to_csv(output_conll, sep=\"\\t\")\n",
+    "print(f\"CoNLL2003 dataset structure output location: {output_conll}\")"
    ]
   },
   {
@@ -417,9 +1029,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "presidio",
+   "display_name": "presidio-research",
    "language": "python",
-   "name": "python3"
+   "name": "presidio_research"
   },
   "language_info": {
    "codemirror_mode": {
@@ -431,7 +1043,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.15"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,
diff --git a/presidio_evaluator/data_generator/faker_extensions/sentences.py b/presidio_evaluator/data_generator/faker_extensions/sentences.py
index fee59bc..63753b2 100644
--- a/presidio_evaluator/data_generator/faker_extensions/sentences.py
+++ b/presidio_evaluator/data_generator/faker_extensions/sentences.py
@@ -196,7 +196,9 @@ def parse(
         except Exception as err:
             raise AttributeError(
                 f'Failed to generate fake data based on template "{template}".'
-                f"You might need to add a new Faker provider! "
+                f"You might need to add a new Faker provider "
+                f"or create an alias (map the entity name to one " 
+                f"of the existing providers)."
                 f"{err}"
             )
 
diff --git a/presidio_evaluator/data_generator/presidio_sentence_faker.py b/presidio_evaluator/data_generator/presidio_sentence_faker.py
index 7a85cae..ec83332 100644
--- a/presidio_evaluator/data_generator/presidio_sentence_faker.py
+++ b/presidio_evaluator/data_generator/presidio_sentence_faker.py
@@ -1,7 +1,8 @@
 import json
 import random
 from pathlib import Path
-from typing import List, Optional, Union, Dict
+from typing import List, Optional, Tuple, Union, Dict
+import re
 
 import numpy as np
 import pandas as pd
@@ -52,12 +53,17 @@ class PresidioSentenceFaker:
     :param: entity_providers: Defaults to presidio_additional_entity_providers, a provided argument overrides this
     :param: base_records: A DataFrame with entity types as columns and each row corresponding to a fake individual.
     Defaults to presidio_evaluator.data_generator.faker_extensions.datasets.load_fake_person_df()
+    :param: entity_type_mapping: A dictionary mapping entity types to Presidio entity types
+    :param: provider_aliases: A list of (provider_name, alias) tuples, each registering `alias` as an additional name for an existing provider.
+    Useful if the templates contain a different name for the entity type than the one supported by Faker or PresidioSentenceFaker.
     :param: random_seed: A seed to make results reproducible between runs
     """
 
-    PROVIDER_ALIASES = dict(
-        name="person", credit_card_number="credit_card", date_of_birth="birthday"
-    )
+    PROVIDER_ALIASES = [
+        ("name", "person"),
+        ("credit_card_number", "credit_card"),
+        ("date_of_birth", "birthday"),
+    ]
     ENTITY_TYPE_MAPPING = dict(
         person="PERSON",
         ip_address="IP_ADDRESS",
@@ -111,6 +117,8 @@ def __init__(
         sentence_templates: Optional[List[str]] = None,
         entity_providers: Optional[List[BaseProvider]] = None,
         base_records: Optional[Union[pd.DataFrame, List[Dict]]] = None,
+        entity_type_mapping: Optional[Dict[str, str]] = None,
+        provider_aliases: Optional[List[Tuple[str, str]]] = None,
         random_seed: Optional[SeedType] = None,
     ):
         self._sentence_templates = sentence_templates
@@ -120,6 +128,7 @@ def __init__(
                 for line in presidio_templates_file_path.read_text().splitlines()
             ]
         if entity_providers is None:
+            print("Using default entity providers")
             entity_providers = presidio_additional_entity_providers
         if base_records is None:
             base_records = load_fake_person_df()
@@ -131,33 +140,94 @@ def __init__(
             self._sentence_faker.add_provider(entity_provider)
 
         self.seed(random_seed)
-        for provider, alias in self.PROVIDER_ALIASES.items():
+
+        if not entity_type_mapping:
+            print(
+                "Using default entity mapping between the entities \
+                  in the templates and the ones in the output dataset"
+            )
+            entity_type_mapping = self.ENTITY_TYPE_MAPPING
+
+        self._entity_type_mapping = entity_type_mapping
+
+        if not provider_aliases:
+            print("Using default provider aliases")
+            provider_aliases = self.PROVIDER_ALIASES
+
+        for provider, alias in provider_aliases:
             self._sentence_faker.add_provider_alias(
                 provider_name=provider, new_name=alias
             )
         self.fake_sentence_results = None
 
     def generate_new_fake_sentences(self, num_samples: int) -> List[InputSample]:
+        """Generate fake sentences based on the templates, input data and entity providers."""
         self.fake_sentence_results = []
         # Map faker generated entity types to Presidio entity types
         for _ in tqdm(range(num_samples), desc="Sampling"):
             template_id = random.choice(range(len(self._sentence_templates)))
             template = self._sentence_templates[template_id]
+            template = self._preprocess_template(template)
             fake_sentence_result = self._sentence_faker.parse(template, template_id)
             for span in fake_sentence_result.spans:
-                span.type = self.ENTITY_TYPE_MAPPING[span.type]
-            for key, value in self.ENTITY_TYPE_MAPPING.items():
+                if span.type in self._entity_type_mapping.keys():
+                    # Use the mapped entity type if exists
+                    span.type = self._entity_type_mapping[span.type]
+                else:
+                    # Otherwise, capitalize the entity type and add to the mapping
+                    print(
+                        f"Warning: Non-mapped entity type found: {span.type} "
+                        f"Non-mapped entities will be mapped to {span.type.upper()} "
+                        f"in the output dataset. If you prefer a different mapping, "
+                        f"pass the `entity_type_mapping` argument with a mapping for this entity type."
+                    )
+                    self._entity_type_mapping[span.type] = span.type.upper()
+            for key, value in self._entity_type_mapping.items():
                 fake_sentence_result.masked = fake_sentence_result.masked.replace(
                     "{{%s}}" % key, "{{%s}}" % value
                 )
             self.fake_sentence_results.append(fake_sentence_result)
         return self.fake_sentence_results
 
-    def seed(self, seed_value=42):
+    @staticmethod
+    def seed(self, seed_value=42) -> None:
+        """Seed the faker and random modules for reproducibility."""
         Faker.seed(seed_value)
         random.seed(seed_value)
         np.random.seed(seed_value)
 
+    def add_provider_alias(self, provider_name: str, new_name: str) -> None:
+        """
+        Adds a copy of a provider, with a different name
+        :param provider_name: Name of original provider
+        :param new_name: New name
+        :example:
+        >>>self.add_provider_alias(provider_name="name", new_name="person")
+        >>>self.person()
+        """
+        self._sentence_faker.add_provider_alias(
+            provider_name=provider_name, new_name=new_name
+        )
+
+    def add_entity_type_mapping(
+        self, input_entity_type: str, output_entity_type: str
+    ) -> None:
+        self._entity_type_mapping[input_entity_type] = output_entity_type
+
+    @staticmethod
+    def _preprocess_template(template: str):
+        """Lowercase the entity names within double curly braces in the template, and replace < and > with {{ and }}."""  # noqa: E501
+
+        def lowercase_within_braces(s):
+            return re.sub(
+                r"{{(.*?)}}", lambda match: f"{{{{{match.group(1).lower()}}}}}", s
+            )
+
+        template = template.replace("<", "{{").replace(">", "}}")
+        template = lowercase_within_braces(template)
+
+        return template
+
 
 if __name__ == "__main__":
     sentence_faker = PresidioSentenceFaker(
diff --git a/presidio_evaluator/data_objects.py b/presidio_evaluator/data_objects.py
index 9e52601..faaa944 100644
--- a/presidio_evaluator/data_objects.py
+++ b/presidio_evaluator/data_objects.py
@@ -180,6 +180,12 @@ def from_json(cls, data, **kwargs):
         return cls(**data, create_tags_from_span=True, **kwargs)
 
     def get_tags(self, scheme="IOB", model_version="en_core_web_sm"):
+        """Extract the tokens and tags from the spans.
+
+        :param scheme: IO, BIO or BILUO
+        :param model_version: The name of the spaCy model to use for tokenization
+        """
+
         start_indices = [span.start_position for span in self.spans]
         end_indices = [span.end_position for span in self.spans]
         tags = [span.entity_type for span in self.spans]
@@ -192,19 +198,25 @@ def get_tags(self, scheme="IOB", model_version="en_core_web_sm"):
             starts=start_indices,
             ends=end_indices,
             tokens=tokens,
+            token_model_version=model_version
         )
 
         return tokens, labels
 
-    def to_conll(self, translate_tags: bool) -> List[Dict[str, Any]]:
+    def to_conll(self, translate_tags: bool, tokenizer: str) -> List[Dict[str, Any]]:
         """
         Turns a list of InputSample objects to a dictionary
         containing text, pos, tag, template_id and label.
         :param translate_tags: Whether to translate tags using the PRESIDIO_SPACY_ENTITIES dictionary
+        :param tokenizer: The name of the spaCy model to use for tokenization
         :return: Dict
         """
 
         conll = []
+
+        if len(self.tokens) == 0:
+            self.tokens, self.tags = self.get_tags(model_version=tokenizer)
+
         for i, token in enumerate(self.tokens):
             if translate_tags:
                 label = self.translate_tag(
@@ -233,7 +245,7 @@ def create_conll_dataset(
         dataset: List["InputSample"],
         translate_tags=False,
         to_bio=True,
-        token_model_version="en_core_web_sm",
+        tokenizer:str="en_core_web_sm",
     ) -> pd.DataFrame:
         if len(dataset) <= 1:
             raise ValueError("Dataset should contain multiple records")
@@ -243,7 +255,8 @@ def create_conll_dataset(
         for sample in tqdm(dataset):
             if to_bio:
                 sample.biluo_to_bio()
-            conll = sample.to_conll(translate_tags=translate_tags)
+            conll = sample.to_conll(translate_tags=translate_tags,
+                                    tokenizer=tokenizer)
             for token in conll:
                 token["sentence"] = i
                 conlls.append(token)
diff --git a/tests/test_presidio_sentence_faker.py b/tests/test_presidio_sentence_faker.py
index 206d5fd..7eb699b 100644
--- a/tests/test_presidio_sentence_faker.py
+++ b/tests/test_presidio_sentence_faker.py
@@ -24,13 +24,13 @@ def test_generate_new_fake_sentences(num_sentences: int):
 
     expected_providers = deepcopy(default_faker_providers)
     expected_providers.extend(presidio_providers)
-    expected_providers.extend([standard_faker.__getattr__(key)
-                               for key in PresidioSentenceFaker.PROVIDER_ALIASES.keys()])
+    expected_providers.extend([standard_faker.__getattr__(alias[0])
+                               for alias in PresidioSentenceFaker.PROVIDER_ALIASES])
     actual_providers = sentence_faker._sentence_faker.providers
     num_aliases = len(PresidioSentenceFaker.PROVIDER_ALIASES)
     actual_num_providers = len(actual_providers)
-    expected_aliases = set(getattr(standard_faker, provider_name)
-                           for provider_name in PresidioSentenceFaker.PROVIDER_ALIASES.keys())
+    expected_aliases = set(getattr(standard_faker, provider_name[0])
+                           for provider_name in PresidioSentenceFaker.PROVIDER_ALIASES)
     assert actual_num_providers == len(expected_providers), \
         f'Expected {len(presidio_providers)} presidio providers to be used and {num_aliases} aliases. ' \
         f'Faker has been extended with {actual_num_providers - len(default_faker_providers)} providers/aliases. ' \

From aa41f10be324e124f7b6acc383813dca33d11ae4 Mon Sep 17 00:00:00 2001
From: Omri Mendels <omri374@users.noreply.github.com>
Date: Mon, 6 Jan 2025 18:48:13 +0200
Subject: [PATCH 2/9] updated package version to 0.2.1

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 99cd13a..eca721e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "presidio_evaluator"
-version = "0.2.0"
+version = "0.2.1"
 description = ""
 authors = ["Microsoft"]
 readme = "README.md"

From 5443db19d460854aaf86d0539947b3e0223d4473 Mon Sep 17 00:00:00 2001
From: Omri Mendels <omri374@users.noreply.github.com>
Date: Mon, 6 Jan 2025 18:51:58 +0200
Subject: [PATCH 3/9] removed CI for py3.8 and added 3.12

---
 azure-pipelines.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 71e23b2..eaf304c 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -8,14 +8,14 @@ pool:
   vmImage: 'ubuntu-latest'
 strategy:
   matrix:
-    Python38:
-      python.version: '3.8'
     Python39:
       python.version: '3.9'
     Python310:
       python.version: '3.10'
     Python311:
       python.version: '3.11'
+    Python312:
+      python.version: '3.12'
 steps:
 - task: UsePythonVersion@0
   inputs:

From 3bdede71b157b446c51b9a1716452828784a199d Mon Sep 17 00:00:00 2001
From: Omri Mendels <omri374@users.noreply.github.com>
Date: Mon, 6 Jan 2025 22:24:37 +0200
Subject: [PATCH 4/9] small changes to data generation notebook

---
 notebooks/1_Generate_data.ipynb               | 278 +++++++-----------
 .../data_generator/presidio_sentence_faker.py |  13 +-
 2 files changed, 120 insertions(+), 171 deletions(-)

diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb
index d5b59e8..fe291ff 100644
--- a/notebooks/1_Generate_data.ipynb
+++ b/notebooks/1_Generate_data.ipynb
@@ -16,12 +16,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {
     "is_executing": true,
     "scrolled": true
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/omri.mendels/Library/Caches/pypoetry/virtualenvs/presidio-evaluator-nCKHFi6i-py3.9/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
    "source": [
     "import datetime\n",
     "import pprint\n",
@@ -69,26 +78,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {
     "is_executing": true
    },
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using default entity providers\n",
+      "Using default entity mapping between the entities                   in the templates and the ones in the output dataset\n",
+      "Using default provider aliases\n"
+     ]
+    },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Sampling: 100%|██████████| 10/10 [00:00<00:00, 12706.16it/s]"
+      "Sampling: 100%|██████████| 10/10 [00:00<00:00, 3407.23it/s]"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Non-mapped entity type found: country Non-mapped entities will be mapped to COUNTRY in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n",
-      "Non-mapped entity type found: name Non-mapped entities will be mapped to NAME in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n",
-      "Non-mapped entity type found: address Non-mapped entities will be mapped to ADDRESS in the output dataset. If you prefer a different mapping, pass the `entity_type_mapping` argument with a mapping for this entity type.\n",
-      "I just moved to {{city}} from {{COUNTRY}}\n",
+      "I just moved to {{GPE}} from {{GPE}}\n",
       "[Span(type: country, value: Italy, char_span: [33: 38]), Span(type: city, value: Gorgoglione, char_span: [16: 27])]\n"
      ]
     },
@@ -108,7 +123,9 @@
     "]\n",
     "\n",
     "\n",
-    "sentence_faker = PresidioSentenceFaker('en_US', lower_case_ratio=0.05, sentence_templates=sentence_templates)\n",
+    "sentence_faker = PresidioSentenceFaker('en_US', \n",
+    "                                       lower_case_ratio=0.05, \n",
+    "                                       sentence_templates=sentence_templates)\n",
     "fake_sentence_results = sentence_faker.generate_new_fake_sentences(10)\n",
     "\n",
     "# Print the spans of the first sample\n",
@@ -128,7 +145,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
    "metadata": {
     "is_executing": true,
     "scrolled": true
@@ -163,16 +180,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using default entity providers\n",
+      "Using default entity mapping between the entities                   in the templates and the ones in the output dataset\n",
+      "Using default provider aliases\n"
+     ]
+    }
+   ],
    "source": [
     "sentence_faker = PresidioSentenceFaker('en_US', lower_case_ratio=0.05)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -259,8 +286,8 @@
        "      <td>...</td>\n",
        "      <td>Formula Gray</td>\n",
        "      <td>LostMillions.com.pt</td>\n",
-       "      <td>Patricia Desrosiers</td>\n",
-       "      <td>Patricia Desrosiers</td>\n",
+       "      <td>Patricia G. Desrosiers</td>\n",
+       "      <td>Patricia G. Desrosiers</td>\n",
        "      <td>Patricia</td>\n",
        "      <td></td>\n",
        "      <td>Ms.</td>\n",
@@ -360,12 +387,12 @@
        "3  183 Epimenidou Street       Limassol         LI  ...            Quickbiz   \n",
        "4           Karu põik 61          Pärnu         PR  ...  Dubrow's Cafeteria   \n",
        "\n",
-       "           domain_name               person                 name  \\\n",
-       "0   MarathonDancing.gl       Marie Hamanová       Marie Hamanová   \n",
-       "1  LostMillions.com.pt  Patricia Desrosiers  Patricia Desrosiers   \n",
-       "2     MediumTube.co.za           Debra Neal           Debra Neal   \n",
-       "3   ImproveLook.com.cy      Peverell Racine      Peverell Racine   \n",
-       "4       PostTan.com.ee      Iolanda Tratnik      Iolanda Tratnik   \n",
+       "           domain_name                  person                    name  \\\n",
+       "0   MarathonDancing.gl          Marie Hamanová          Marie Hamanová   \n",
+       "1  LostMillions.com.pt  Patricia G. Desrosiers  Patricia G. Desrosiers   \n",
+       "2     MediumTube.co.za              Debra Neal              Debra Neal   \n",
+       "3   ImproveLook.com.cy         Peverell Racine         Peverell Racine   \n",
+       "4       PostTan.com.ee         Iolanda Tratnik         Iolanda Tratnik   \n",
        "\n",
        "  first_name_female first_name_male prefix_female prefix_male  \\\n",
        "0             Marie                          Mrs.               \n",
@@ -384,7 +411,7 @@
        "[5 rows x 37 columns]"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -398,12 +425,38 @@
    "metadata": {},
    "source": [
     "`PresidioSentenceFaker` adds additional providers by default, which are not included in the Faker package.\n",
-    "These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`"
+    "These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`\n",
+    "\n",
+    "It is possible to create providers for additional entity types by extending Faker's `BaseProvider` class, \n",
+    "and calling `add_provider` on the `PresidioSentenceFaker` instance.\n",
+    "For example:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "from faker.providers import BaseProvider\n",
+    "\n",
+    "class MarsIdProvider(BaseProvider):\n",
+    "    def mars_id(self):\n",
+    "        # Generate a random number between 1 and 50\n",
+    "        row = random.randint(1, 50)\n",
+    "        # Generate a random letter from A-K\n",
+    "        location = random.choice('ABCDEFGHIJK')\n",
+    "        # Return the fictional Mars ID as the number followed by the letter (e.g., \"25A\")\n",
+    "        return f\"{row}{location}\"\n",
+    "\n",
+    "sentence_faker.add_provider(MarsIdProvider)\n",
+    "# Now a new `mars_id` entity can be generated if a template has `mars_id` in it.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
    "metadata": {
     "is_executing": true
    },
@@ -414,7 +467,7 @@
        "presidio_evaluator.data_generator.faker_extensions.providers.ReligionProvider"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -446,7 +499,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 15,
    "metadata": {
     "is_executing": true,
     "pycharm": {
@@ -462,7 +515,7 @@
        " ('date_of_birth', 'birthday')]"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -488,14 +541,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 8521.17it/s]"
+      "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 7794.60it/s]"
      ]
     },
     {
@@ -529,7 +582,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 17,
    "metadata": {
     "is_executing": true,
     "scrolled": true
@@ -568,7 +621,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 18,
    "metadata": {
     "is_executing": true,
     "pycharm": {
@@ -598,7 +651,7 @@
        "         'US_DRIVER_LICENSE': 4})"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -613,7 +666,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -623,8 +676,8 @@
       "Full text: The bus station is on Via Pasquale Scura 127\n",
       "Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]\n",
       "\n",
-      "Full text: Leigha C Mackay\\n\\nLa Sagne\\nSwitzerland\n",
-      "Spans: [Span(type: country, value: Switzerland, char_span: [29: 40]), Span(type: city, value: La Sagne, char_span: [19: 27]), Span(type: name, value: Leigha C Mackay, char_span: [0: 15])]\n",
+      "Full text: Leigha Mackay\\n\\nLa Sagne\\nSwitzerland\n",
+      "Spans: [Span(type: country, value: Switzerland, char_span: [27: 38]), Span(type: city, value: La Sagne, char_span: [17: 25]), Span(type: name, value: Leigha Mackay, char_span: [0: 13])]\n",
       "\n",
       "Full text: Can someone call me on 06-82237745? I have some questions about opening an account.\n",
       "Spans: [Span(type: phone_number, value: 06-82237745, char_span: [23: 34])]\n",
@@ -635,8 +688,8 @@
       "Full text: Csanád had given Csanád his address: 083 254 Damvergi Street, Nicosia\n",
       "Spans: [Span(type: city, value: Nicosia, char_span: [62: 69]), Span(type: street_name, value: 254 Damvergi Street, char_span: [41: 60]), Span(type: building_number, value: 083, char_span: [37: 40]), Span(type: first_name, value: Csanád, char_span: [17: 23]), Span(type: first_name_male, value: Csanád, char_span: [0: 6])]\n",
       "\n",
-      "Full text: You can tell Cecilie was a huge Cecilie K Josefsen fan. Written when he was 21.\n",
-      "Spans: [Span(type: age, value: 21, char_span: [76: 78]), Span(type: person, value: cecilie k josefsen, char_span: [32: 50]), Span(type: first_name, value: cecilie, char_span: [13: 20])]\n",
+      "Full text: You can tell Cecilie was a huge Cecilie Josefsen fan. Written when he was 21.\n",
+      "Spans: [Span(type: age, value: 21, char_span: [74: 76]), Span(type: person, value: cecilie josefsen, char_span: [32: 48]), Span(type: first_name, value: cecilie, char_span: [13: 20])]\n",
       "\n",
       "Full text: Who's coming to Switzerland with me?\n",
       "Spans: [Span(type: country, value: Switzerland, char_span: [16: 27])]\n",
@@ -671,7 +724,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 20,
    "metadata": {
     "is_executing": true,
     "pycharm": {
@@ -685,20 +738,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'../data/generated_size_1500_date_January_06_2025.json'"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "output_file"
    ]
@@ -716,7 +758,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 23,
    "metadata": {
     "is_executing": true,
     "pycharm": {
@@ -728,7 +770,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 1500/1500 [00:00<00:00, 35869.80it/s]\n"
+      "100%|██████████| 1500/1500 [00:00<00:00, 47248.41it/s]\n"
      ]
     },
     {
@@ -851,137 +893,37 @@
        "      <td>B-name</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>C</td>\n",
-       "      <td>PROPN</td>\n",
-       "      <td>NNP</td>\n",
-       "      <td>189</td>\n",
-       "      <td>I-name</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>11</th>\n",
-       "      <td>Mackay\\n\\nLa</td>\n",
-       "      <td>NOUN</td>\n",
-       "      <td>NN</td>\n",
-       "      <td>189</td>\n",
-       "      <td>I-name</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>Sagne\\nSwitzerland</td>\n",
-       "      <td>PROPN</td>\n",
-       "      <td>NNP</td>\n",
-       "      <td>189</td>\n",
-       "      <td>B-city</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>Can</td>\n",
-       "      <td>AUX</td>\n",
-       "      <td>MD</td>\n",
-       "      <td>57</td>\n",
-       "      <td>O</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14</th>\n",
-       "      <td>someone</td>\n",
-       "      <td>PRON</td>\n",
-       "      <td>NN</td>\n",
-       "      <td>57</td>\n",
-       "      <td>O</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15</th>\n",
-       "      <td>call</td>\n",
-       "      <td>VERB</td>\n",
-       "      <td>VB</td>\n",
-       "      <td>57</td>\n",
-       "      <td>O</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>16</th>\n",
-       "      <td>me</td>\n",
-       "      <td>PRON</td>\n",
-       "      <td>PRP</td>\n",
-       "      <td>57</td>\n",
-       "      <td>O</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>17</th>\n",
-       "      <td>on</td>\n",
-       "      <td>ADP</td>\n",
-       "      <td>IN</td>\n",
-       "      <td>57</td>\n",
-       "      <td>O</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18</th>\n",
-       "      <td>06</td>\n",
-       "      <td>NUM</td>\n",
-       "      <td>CD</td>\n",
-       "      <td>57</td>\n",
-       "      <td>B-phone_number</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19</th>\n",
-       "      <td>-</td>\n",
-       "      <td>SYM</td>\n",
-       "      <td>SYM</td>\n",
-       "      <td>57</td>\n",
-       "      <td>I-phone_number</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                  text    pos  tag  template_id           label  sentence\n",
-       "0                  The    DET   DT          163               O         0\n",
-       "1                  bus   NOUN   NN          163               O         0\n",
-       "2              station   NOUN   NN          163               O         0\n",
-       "3                   is    AUX  VBZ          163               O         0\n",
-       "4                   on    ADP   IN          163               O         0\n",
-       "5                  Via  PROPN  NNP          163   B-street_name         0\n",
-       "6             Pasquale  PROPN  NNP          163   I-street_name         0\n",
-       "7                Scura  PROPN  NNP          163   I-street_name         0\n",
-       "8                  127    NUM   CD          163   I-street_name         0\n",
-       "9               Leigha   VERB   VB          189          B-name         1\n",
-       "10                   C  PROPN  NNP          189          I-name         1\n",
-       "11        Mackay\\n\\nLa   NOUN   NN          189          I-name         1\n",
-       "12  Sagne\\nSwitzerland  PROPN  NNP          189          B-city         1\n",
-       "13                 Can    AUX   MD           57               O         2\n",
-       "14             someone   PRON   NN           57               O         2\n",
-       "15                call   VERB   VB           57               O         2\n",
-       "16                  me   PRON  PRP           57               O         2\n",
-       "17                  on    ADP   IN           57               O         2\n",
-       "18                  06    NUM   CD           57  B-phone_number         2\n",
-       "19                   -    SYM  SYM           57  I-phone_number         2"
+       "       text    pos  tag  template_id          label  sentence\n",
+       "0       The    DET   DT          163              O         0\n",
+       "1       bus   NOUN   NN          163              O         0\n",
+       "2   station   NOUN   NN          163              O         0\n",
+       "3        is    AUX  VBZ          163              O         0\n",
+       "4        on    ADP   IN          163              O         0\n",
+       "5       Via  PROPN  NNP          163  B-street_name         0\n",
+       "6  Pasquale  PROPN  NNP          163  I-street_name         0\n",
+       "7     Scura  PROPN  NNP          163  I-street_name         0\n",
+       "8       127    NUM   CD          163  I-street_name         0\n",
+       "9    Leigha   VERB   VB          189         B-name         1"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "conll = InputSample.create_conll_dataset(dataset=fake_records)\n",
-    "conll.head(20)"
+    "conll.head(10)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 24,
    "metadata": {
     "is_executing": true,
     "pycharm": {
@@ -1008,7 +950,7 @@
    "source": [
     "### Next steps\n",
     "\n",
-    "- Evaluate Presidio using this fake data: [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
+    "- Evaluate Presidio using fake data: [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
     "- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset: [Sample](3_Split_by_pattern_#.ipynb)\n",
     "- Conduct a small exploratory data analysis on the generated data: [Sample](2_PII_EDA.ipynb)"
    ]
diff --git a/presidio_evaluator/data_generator/presidio_sentence_faker.py b/presidio_evaluator/data_generator/presidio_sentence_faker.py
index ec83332..f5355e0 100644
--- a/presidio_evaluator/data_generator/presidio_sentence_faker.py
+++ b/presidio_evaluator/data_generator/presidio_sentence_faker.py
@@ -143,8 +143,8 @@ def __init__(
 
         if not entity_type_mapping:
             print(
-                "Using default entity mapping between the entities \
-                  in the templates and the ones in the output dataset"
+                "Using default entity mapping between the entities "
+                "in the templates and the ones in the output dataset"
             )
             entity_type_mapping = self.ENTITY_TYPE_MAPPING
 
@@ -176,7 +176,7 @@ def generate_new_fake_sentences(self, num_samples: int) -> List[InputSample]:
                 else:
                     # Otherwise, capitalize the entity type and add to the mapping
                     print(
-                        f"Warning: Non-mapped entity type found: {span.type} "
+                        f"Warning: Non-mapped entity type found: {span.type}. "
                         f"Non-mapped entities will be mapped to {span.type.upper()} "
                         f"in the output dataset. If you prefer a different mapping, "
                         f"pass the `entity_type_mapping` argument with a mapping for this entity type."
@@ -196,6 +196,13 @@ def seed(self, seed_value=42) -> None:
         random.seed(seed_value)
         np.random.seed(seed_value)
 
+    def add_provider(self, provider:BaseProvider) ->None:
+        """
+        Add a provider to the sentence faker
+        :param provider: A faker provider inheriting from BaseProvider
+        """
+        self._sentence_faker.add_provider(provider)
+
     def add_provider_alias(self, provider_name: str, new_name: str) -> None:
         """
         Adds a copy of a provider, with a different name

From 21cc2292d36507494c08db1022f87ccc374181ed Mon Sep 17 00:00:00 2001
From: Omri Mendels <omri374@users.noreply.github.com>
Date: Mon, 6 Jan 2025 22:33:08 +0200
Subject: [PATCH 5/9] minor updates

---
 .../data_generator/faker_extensions/sentences.py          | 8 +++-----
 .../data_generator/presidio_sentence_faker.py             | 2 +-
 presidio_evaluator/data_objects.py                        | 8 ++++++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/presidio_evaluator/data_generator/faker_extensions/sentences.py b/presidio_evaluator/data_generator/faker_extensions/sentences.py
index 63753b2..c490f91 100644
--- a/presidio_evaluator/data_generator/faker_extensions/sentences.py
+++ b/presidio_evaluator/data_generator/faker_extensions/sentences.py
@@ -195,11 +195,9 @@ def parse(
             return fake_pattern
         except Exception as err:
             raise AttributeError(
-                f'Failed to generate fake data based on template "{template}".'
-                f"You might need to add a new Faker provider "
-                f"or create an alias (map the entity name to one " 
-                f"of the existing providers)."
-                f"{err}"
+                f'Failed to generate fake data based on template "{template}". '
+                f'Add a new Faker provider or create an alias '
+                f'for the entity name. {err}'
             )
 
     @staticmethod
diff --git a/presidio_evaluator/data_generator/presidio_sentence_faker.py b/presidio_evaluator/data_generator/presidio_sentence_faker.py
index f5355e0..009cb37 100644
--- a/presidio_evaluator/data_generator/presidio_sentence_faker.py
+++ b/presidio_evaluator/data_generator/presidio_sentence_faker.py
@@ -190,7 +190,7 @@ def generate_new_fake_sentences(self, num_samples: int) -> List[InputSample]:
         return self.fake_sentence_results
 
     @staticmethod
-    def seed(self, seed_value=42) -> None:
+    def seed(seed_value=42) -> None:
         """Seed the faker and random modules for reproducibility."""
         Faker.seed(seed_value)
         random.seed(seed_value)
diff --git a/presidio_evaluator/data_objects.py b/presidio_evaluator/data_objects.py
index faaa944..33fb842 100644
--- a/presidio_evaluator/data_objects.py
+++ b/presidio_evaluator/data_objects.py
@@ -179,7 +179,9 @@ def from_json(cls, data, **kwargs):
             data["spans"] = [Span.from_json(span) for span in data["spans"]]
         return cls(**data, create_tags_from_span=True, **kwargs)
 
-    def get_tags(self, scheme="IOB", model_version="en_core_web_sm"):
+    def get_tags(self,
+                 scheme:str="IOB",
+                 model_version:str="en_core_web_sm"):
         """Extract the tokens and tags from the spans.
 
         :param scheme: IO, BIO or BILUO
@@ -203,7 +205,9 @@ def get_tags(self, scheme="IOB", model_version="en_core_web_sm"):
 
         return tokens, labels
 
-    def to_conll(self, translate_tags: bool, tokenizer: str) -> List[Dict[str, Any]]:
+    def to_conll(self,
+                 translate_tags: bool,
+                 tokenizer: str="en_core_web_sm") -> List[Dict[str, Any]]:
         """
         Turns a list of InputSample objects to a dictionary
         containing text, pos, tag, template_id and label.

From b57cc70f446ab99a1fba726a71e911902f755c1c Mon Sep 17 00:00:00 2001
From: Omri Mendels <omri374@users.noreply.github.com>
Date: Mon, 6 Jan 2025 22:35:32 +0200
Subject: [PATCH 6/9] formatting

---
 .../faker_extensions/sentences.py               |  4 ++--
 .../data_generator/presidio_sentence_faker.py   |  2 +-
 presidio_evaluator/data_objects.py              | 17 +++++++----------
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/presidio_evaluator/data_generator/faker_extensions/sentences.py b/presidio_evaluator/data_generator/faker_extensions/sentences.py
index c490f91..087052c 100644
--- a/presidio_evaluator/data_generator/faker_extensions/sentences.py
+++ b/presidio_evaluator/data_generator/faker_extensions/sentences.py
@@ -196,8 +196,8 @@ def parse(
         except Exception as err:
             raise AttributeError(
                 f'Failed to generate fake data based on template "{template}". '
-                f'Add a new Faker provider or create an alias '
-                f'for the entity name. {err}'
+                f"Add a new Faker provider or create an alias "
+                f"for the entity name. {err}"
             )
 
     @staticmethod
diff --git a/presidio_evaluator/data_generator/presidio_sentence_faker.py b/presidio_evaluator/data_generator/presidio_sentence_faker.py
index 009cb37..abc3205 100644
--- a/presidio_evaluator/data_generator/presidio_sentence_faker.py
+++ b/presidio_evaluator/data_generator/presidio_sentence_faker.py
@@ -196,7 +196,7 @@ def seed(seed_value=42) -> None:
         random.seed(seed_value)
         np.random.seed(seed_value)
 
-    def add_provider(self, provider:BaseProvider) ->None:
+    def add_provider(self, provider: BaseProvider) -> None:
         """
         Add a provider to the sentence faker
         :param provider: A faker provider inheriting from BaseProvider
diff --git a/presidio_evaluator/data_objects.py b/presidio_evaluator/data_objects.py
index 33fb842..0dd7d2f 100644
--- a/presidio_evaluator/data_objects.py
+++ b/presidio_evaluator/data_objects.py
@@ -179,9 +179,7 @@ def from_json(cls, data, **kwargs):
             data["spans"] = [Span.from_json(span) for span in data["spans"]]
         return cls(**data, create_tags_from_span=True, **kwargs)
 
-    def get_tags(self,
-                 scheme:str="IOB",
-                 model_version:str="en_core_web_sm"):
+    def get_tags(self, scheme: str = "IOB", model_version: str = "en_core_web_sm"):
         """Extract the tokens and tags from the spans.
 
         :param scheme: IO, BIO or BILUO
@@ -200,14 +198,14 @@ def get_tags(self,
             starts=start_indices,
             ends=end_indices,
             tokens=tokens,
-            token_model_version=model_version
+            token_model_version=model_version,
         )
 
         return tokens, labels
 
-    def to_conll(self,
-                 translate_tags: bool,
-                 tokenizer: str="en_core_web_sm") -> List[Dict[str, Any]]:
+    def to_conll(
+        self, translate_tags: bool, tokenizer: str = "en_core_web_sm"
+    ) -> List[Dict[str, Any]]:
         """
         Turns a list of InputSample objects to a dictionary
         containing text, pos, tag, template_id and label.
@@ -249,7 +247,7 @@ def create_conll_dataset(
         dataset: List["InputSample"],
         translate_tags=False,
         to_bio=True,
-        tokenizer:str="en_core_web_sm",
+        tokenizer: str = "en_core_web_sm",
     ) -> pd.DataFrame:
         if len(dataset) <= 1:
             raise ValueError("Dataset should contain multiple records")
@@ -259,8 +257,7 @@ def create_conll_dataset(
         for sample in tqdm(dataset):
             if to_bio:
                 sample.biluo_to_bio()
-            conll = sample.to_conll(translate_tags=translate_tags,
-                                    tokenizer=tokenizer)
+            conll = sample.to_conll(translate_tags=translate_tags, tokenizer=tokenizer)
             for token in conll:
                 token["sentence"] = i
                 conlls.append(token)

From f18d5f4c38cea6345aa403000527a87772b7d0c1 Mon Sep 17 00:00:00 2001
From: Omri Mendels <omri374@users.noreply.github.com>
Date: Mon, 6 Jan 2025 22:36:38 +0200
Subject: [PATCH 7/9] removed warning

---
 notebooks/1_Generate_data.ipynb | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb
index fe291ff..d6c7596 100644
--- a/notebooks/1_Generate_data.ipynb
+++ b/notebooks/1_Generate_data.ipynb
@@ -25,10 +25,7 @@
     {
      "name": "stderr",
      "output_type": "stream",
-     "text": [
-      "/Users/omri.mendels/Library/Caches/pypoetry/virtualenvs/presidio-evaluator-nCKHFi6i-py3.9/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n",
-      "  warnings.warn(\n"
-     ]
+     "text": []
     }
    ],
    "source": [

From d8a7de39150bad00114195c2e9383556a4a7c89b Mon Sep 17 00:00:00 2001
From: Omri Mendels <omri374@users.noreply.github.com>
Date: Mon, 6 Jan 2025 22:44:24 +0200
Subject: [PATCH 8/9] added tests to template preprocessing

---
 tests/test_presidio_sentence_faker.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/test_presidio_sentence_faker.py b/tests/test_presidio_sentence_faker.py
index 7eb699b..34d2d9e 100644
--- a/tests/test_presidio_sentence_faker.py
+++ b/tests/test_presidio_sentence_faker.py
@@ -43,3 +43,14 @@ def test_generate_new_fake_sentences(num_sentences: int):
         assert fake_sentence_result.full_text
         assert fake_sentence_result.masked
         assert fake_sentence_result.template_id >= 0
+
+
+@pytest.mark.parametrize("template_before, template_after", [
+    ("I just moved to {{CiTY}} from {{Country}}",
+    "I just moved to {{city}} from {{country}}"),
+    ("I just moved to <city> from <country>.",
+    "I just moved to {{city}} from {{country}}.")
+])
+def test_preprocess_template(template_before: str, template_after: str):
+    sentence_faker = PresidioSentenceFaker(locale='en', lower_case_ratio=0)
+    assert sentence_faker._preprocess_template(template_before) == template_after

From 58f7f28672d731aa53c69c6e22bf8f7300b1593b Mon Sep 17 00:00:00 2001
From: Omri Mendels <omri374@users.noreply.github.com>
Date: Mon, 6 Jan 2025 22:47:27 +0200
Subject: [PATCH 9/9] re-run notebook

---
 notebooks/1_Generate_data.ipynb | 305 ++++++++++++++++++--------------
 1 file changed, 169 insertions(+), 136 deletions(-)

diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb
index d6c7596..78da83c 100644
--- a/notebooks/1_Generate_data.ipynb
+++ b/notebooks/1_Generate_data.ipynb
@@ -16,18 +16,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "is_executing": true,
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": []
-    }
-   ],
+   "outputs": [],
    "source": [
     "import datetime\n",
     "import pprint\n",
@@ -85,7 +79,7 @@
      "output_type": "stream",
      "text": [
       "Using default entity providers\n",
-      "Using default entity mapping between the entities                   in the templates and the ones in the output dataset\n",
+      "Using default entity mapping between the entities in the templates and the ones in the output dataset\n",
       "Using default provider aliases\n"
      ]
     },
@@ -93,15 +87,15 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Sampling: 100%|██████████| 10/10 [00:00<00:00, 3407.23it/s]"
+      "Sampling: 100%|██████████| 10/10 [00:00<00:00, 3959.88it/s]"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "I just moved to {{GPE}} from {{GPE}}\n",
-      "[Span(type: country, value: Italy, char_span: [33: 38]), Span(type: city, value: Gorgoglione, char_span: [16: 27])]\n"
+      "Please send it to {{STREET_ADDRESS}}\n",
+      "[Span(type: address, value: the corner of Καλαμπάκα 33 and Stefan Land, char_span: [18: 60])]\n"
      ]
     },
     {
@@ -142,7 +136,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 4,
    "metadata": {
     "is_executing": true,
     "scrolled": true
@@ -177,7 +171,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -185,7 +179,7 @@
      "output_type": "stream",
      "text": [
       "Using default entity providers\n",
-      "Using default entity mapping between the entities                   in the templates and the ones in the output dataset\n",
+      "Using default entity mapping between the entities in the templates and the ones in the output dataset\n",
       "Using default provider aliases\n"
      ]
     }
@@ -196,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -307,8 +301,8 @@
        "      <td>...</td>\n",
        "      <td>Dahlkemper's</td>\n",
        "      <td>MediumTube.co.za</td>\n",
-       "      <td>Debra Neal</td>\n",
-       "      <td>Debra Neal</td>\n",
+       "      <td>Debra O. Neal</td>\n",
+       "      <td>Debra O. Neal</td>\n",
        "      <td>Debra</td>\n",
        "      <td></td>\n",
        "      <td>Ms.</td>\n",
@@ -331,8 +325,8 @@
        "      <td>...</td>\n",
        "      <td>Quickbiz</td>\n",
        "      <td>ImproveLook.com.cy</td>\n",
-       "      <td>Peverell Racine</td>\n",
-       "      <td>Peverell Racine</td>\n",
+       "      <td>Peverell C. Racine</td>\n",
+       "      <td>Peverell C. Racine</td>\n",
        "      <td></td>\n",
        "      <td>Peverell</td>\n",
        "      <td></td>\n",
@@ -355,8 +349,8 @@
        "      <td>...</td>\n",
        "      <td>Dubrow's Cafeteria</td>\n",
        "      <td>PostTan.com.ee</td>\n",
-       "      <td>Iolanda Tratnik</td>\n",
-       "      <td>Iolanda Tratnik</td>\n",
+       "      <td>Iolanda S. Tratnik</td>\n",
+       "      <td>Iolanda S. Tratnik</td>\n",
        "      <td>Iolanda</td>\n",
        "      <td></td>\n",
        "      <td>Mrs.</td>\n",
@@ -387,9 +381,9 @@
        "           domain_name                  person                    name  \\\n",
        "0   MarathonDancing.gl          Marie Hamanová          Marie Hamanová   \n",
        "1  LostMillions.com.pt  Patricia G. Desrosiers  Patricia G. Desrosiers   \n",
-       "2     MediumTube.co.za              Debra Neal              Debra Neal   \n",
-       "3   ImproveLook.com.cy         Peverell Racine         Peverell Racine   \n",
-       "4       PostTan.com.ee         Iolanda Tratnik         Iolanda Tratnik   \n",
+       "2     MediumTube.co.za           Debra O. Neal           Debra O. Neal   \n",
+       "3   ImproveLook.com.cy      Peverell C. Racine      Peverell C. Racine   \n",
+       "4       PostTan.com.ee      Iolanda S. Tratnik      Iolanda S. Tratnik   \n",
        "\n",
        "  first_name_female first_name_male prefix_female prefix_male  \\\n",
        "0             Marie                          Mrs.               \n",
@@ -408,7 +402,7 @@
        "[5 rows x 37 columns]"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -431,7 +425,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -453,7 +447,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 8,
    "metadata": {
     "is_executing": true
    },
@@ -464,7 +458,7 @@
        "presidio_evaluator.data_generator.faker_extensions.providers.ReligionProvider"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -496,7 +490,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 9,
    "metadata": {
     "is_executing": true,
     "pycharm": {
@@ -512,7 +506,7 @@
        " ('date_of_birth', 'birthday')]"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -538,22 +532,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 7794.60it/s]"
+      "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 13821.21it/s]"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Full text: The bus station is on Via Pasquale Scura 127\n",
-      "Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]\n",
+      "Full text: The title refers to Riddersporen 1 street in STAVANGER. It was on this street that many of the clubs where Metallica first played were situated. \"Battery is found in me\" shows that these early shows on Everardus Mountains Street were important to them. Battery is where \"lunacy finds you\" and you \"smash through the boundaries.\"\n",
+      "Spans: [Span(type: street_name, value: Everardus Mountains, char_span: [202: 221]), Span(type: city, value: STAVANGER, char_span: [45: 54]), Span(type: street_name, value: Riddersporen 1, char_span: [20: 34])]\n",
       "\n"
      ]
     },
@@ -579,7 +573,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 11,
    "metadata": {
     "is_executing": true,
     "scrolled": true
@@ -592,7 +586,7 @@
       "Total: 1500\n",
       "Avg # of records per template: 7.142857142857143\n",
       "Median # of records per template: 7.0\n",
-      "Std: 2.4394713378441786\n"
+      "Std: 2.6812526263406258\n"
      ]
     }
    ],
@@ -618,7 +612,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 12,
    "metadata": {
     "is_executing": true,
     "pycharm": {
@@ -629,26 +623,26 @@
     {
      "data": {
       "text/plain": [
-       "Counter({'PERSON': 895,\n",
-       "         'STREET_ADDRESS': 571,\n",
-       "         'GPE': 375,\n",
-       "         'ORGANIZATION': 277,\n",
-       "         'PHONE_NUMBER': 124,\n",
-       "         'CREDIT_CARD': 115,\n",
-       "         'DATE_TIME': 110,\n",
-       "         'AGE': 77,\n",
-       "         'TITLE': 71,\n",
-       "         'NRP': 67,\n",
-       "         'EMAIL_ADDRESS': 38,\n",
-       "         'DOMAIN_NAME': 31,\n",
-       "         'ZIP_CODE': 25,\n",
-       "         'IP_ADDRESS': 17,\n",
-       "         'US_SSN': 15,\n",
-       "         'IBAN_CODE': 12,\n",
-       "         'US_DRIVER_LICENSE': 4})"
+       "Counter({'PERSON': 874,\n",
+       "         'STREET_ADDRESS': 609,\n",
+       "         'GPE': 442,\n",
+       "         'ORGANIZATION': 253,\n",
+       "         'CREDIT_CARD': 131,\n",
+       "         'PHONE_NUMBER': 117,\n",
+       "         'DATE_TIME': 106,\n",
+       "         'TITLE': 91,\n",
+       "         'AGE': 79,\n",
+       "         'NRP': 66,\n",
+       "         'ZIP_CODE': 42,\n",
+       "         'EMAIL_ADDRESS': 33,\n",
+       "         'DOMAIN_NAME': 30,\n",
+       "         'IBAN_CODE': 26,\n",
+       "         'IP_ADDRESS': 18,\n",
+       "         'US_SSN': 18,\n",
+       "         'US_DRIVER_LICENSE': 9})"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -663,41 +657,55 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Full text: The bus station is on Via Pasquale Scura 127\n",
-      "Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]\n",
+      "Full text: The title refers to Riddersporen 1 street in STAVANGER. It was on this street that many of the clubs where Metallica first played were situated. \"Battery is found in me\" shows that these early shows on Everardus Mountains Street were important to them. Battery is where \"lunacy finds you\" and you \"smash through the boundaries.\"\n",
+      "Spans: [Span(type: street_name, value: Everardus Mountains, char_span: [202: 221]), Span(type: city, value: STAVANGER, char_span: [45: 54]), Span(type: street_name, value: Riddersporen 1, char_span: [20: 34])]\n",
       "\n",
-      "Full text: Leigha Mackay\\n\\nLa Sagne\\nSwitzerland\n",
-      "Spans: [Span(type: country, value: Switzerland, char_span: [27: 38]), Span(type: city, value: La Sagne, char_span: [17: 25]), Span(type: name, value: Leigha Mackay, char_span: [0: 13])]\n",
+      "Full text: The Ilta T Ryhänen version recorded for Weatherford International Inc became the first celebrity recording by a classical musician to sell one million copies. The song was awarded the seventh gold disc ever granted.\n",
+      "Spans: [Span(type: organization, value: Weatherford International Inc, char_span: [40: 69]), Span(type: person, value: Ilta T Ryhänen, char_span: [4: 18])]\n",
       "\n",
-      "Full text: Can someone call me on 06-82237745? I have some questions about opening an account.\n",
-      "Spans: [Span(type: phone_number, value: 06-82237745, char_span: [23: 34])]\n",
+      "Full text: We'll meet Monday at JAPAN PULP AND PAPER COMPANY LIMITED, 5931 84 Cassinia Street, GUNDAGAI\n",
+      "Spans: [Span(type: city, value: GUNDAGAI, char_span: [84: 92]), Span(type: street_name, value: 84 Cassinia Street, char_span: [64: 82]), Span(type: building_number, value: 5931, char_span: [59: 63]), Span(type: organization, value: JAPAN PULP AND PAPER COMPANY LIMITED, char_span: [21: 57]), Span(type: day_of_week, value: Monday, char_span: [11: 17])]\n",
       "\n",
-      "Full text: Could you please send me the last billed amount for cc 4218196001337 on my e-mail TomaszJablonski@gustr.com?\n",
-      "Spans: [Span(type: email, value: TomaszJablonski@gustr.com, char_span: [82: 107]), Span(type: credit_card_number, value: 4218196001337, char_span: [55: 68])]\n",
+      "Full text: Can someone call me on 0377 7151585? I have some questions about opening an account.\n",
+      "Spans: [Span(type: phone_number, value: 0377 7151585, char_span: [23: 35])]\n",
       "\n",
-      "Full text: Csanád had given Csanád his address: 083 254 Damvergi Street, Nicosia\n",
-      "Spans: [Span(type: city, value: Nicosia, char_span: [62: 69]), Span(type: street_name, value: 254 Damvergi Street, char_span: [41: 60]), Span(type: building_number, value: 083, char_span: [37: 40]), Span(type: first_name, value: Csanád, char_span: [17: 23]), Span(type: first_name_male, value: Csanád, char_span: [0: 6])]\n",
+      "Full text: Leena R Filppula\\nTelephone and Data Systems Inc.\\nServidão Fernando Albrecht 673 Szemere Radial\n",
+      " Suite 538\n",
+      " Joinville\n",
+      " Brazil 27518\\n032 627 37 30 office\\n(07700)331659 fax\\n+41 47 717 21 68 mobile\\n\n",
+      "Spans: [Span(type: phone_number, value: +41 47 717 21 68, char_span: [175: 191]), Span(type: phone_number, value: (07700)331659, char_span: [156: 169]), Span(type: phone_number, value: 032 627 37 30, char_span: [134: 147]), Span(type: address, value: Servidão Fernando Albrecht 673 Szemere Radial\n",
+      " Suite 538\n",
+      " Joinville\n",
+      " Brazil 27518, char_span: [51: 132]), Span(type: organization, value: Telephone and Data Systems Inc., char_span: [18: 49]), Span(type: name, value: Leena R Filppula, char_span: [0: 16])]\n",
       "\n",
-      "Full text: You can tell Cecilie was a huge Cecilie Josefsen fan. Written when he was 21.\n",
-      "Spans: [Span(type: age, value: 21, char_span: [74: 76]), Span(type: person, value: cecilie josefsen, char_span: [32: 48]), Span(type: first_name, value: cecilie, char_span: [13: 20])]\n",
+      "Full text: Bot: Where would you like this to be sent to? User: 11129 Rua Forno 76\n",
+      " Suite 599\n",
+      " Quinta do Passadouro de Cima\n",
+      " Portugal 66984\n",
+      "Spans: [Span(type: address, value: 11129 Rua Forno 76\n",
+      " Suite 599\n",
+      " Quinta do Passadouro de Cima\n",
+      " Portugal 66984, char_span: [52: 127])]\n",
       "\n",
-      "Full text: Who's coming to Switzerland with me?\n",
-      "Spans: [Span(type: country, value: Switzerland, char_span: [16: 27])]\n",
+      "Full text: One of the most depressing songs on the list. He's injured from the waist down from Spain, but Alexander just has to get laid. Don't go to town, Christopher!\n",
+      "Spans: [Span(type: first_name, value: Christopher, char_span: [145: 156]), Span(type: first_name, value: Alexander, char_span: [95: 104]), Span(type: country, value: Spain, char_span: [84: 89])]\n",
       "\n",
-      "Full text: Helena Carlsen\\n\\n637 Strojírenská 1006\\n Suite 026\\n Svratka\\n Czech Republic 45098\n",
-      "Spans: [Span(type: postcode, value: 45098, char_span: [79: 84]), Span(type: country, value: Czech Republic, char_span: [64: 78]), Span(type: city, value: Svratka, char_span: [54: 61]), Span(type: secondary_address, value: Suite 026, char_span: [42: 51]), Span(type: street_name, value: Strojírenská 1006, char_span: [22: 39]), Span(type: building_number, value: 637, char_span: [18: 21]), Span(type: person, value: Helena Carlsen, char_span: [0: 14])]\n",
+      "Full text: Our offices are located at Romina and Müürivahe 27\n",
+      "Spans: [Span(type: address, value: Romina and Müürivahe 27, char_span: [27: 50])]\n",
       "\n",
-      "Full text: Francesca Freeman\\n\\n35116 Rua Arapiraca 1943\\n Apt. 559\\n Teixeira de Freitas\\n Brazil 35172\\n(73) 4746-3459-Office\\,781-618-4959-Fax\n",
-      "Spans: [Span(type: phone_number, value: 781-618-4959, char_span: [118: 130]), Span(type: phone_number, value: (73) 4746-3459, char_span: [95: 109]), Span(type: postcode, value: 35172, char_span: [88: 93]), Span(type: country, value: Brazil, char_span: [81: 87]), Span(type: city, value: Teixeira de Freitas, char_span: [59: 78]), Span(type: secondary_address, value: Apt. 559, char_span: [48: 56]), Span(type: street_name, value: Rua Arapiraca 1943, char_span: [27: 45]), Span(type: building_number, value: 35116, char_span: [21: 26]), Span(type: person, value: Francesca Freeman, char_span: [0: 17])]\n",
+      "Full text: Meet me at Unit 8161 Box 6817\n",
+      "DPO AE 26241\n",
+      "Spans: [Span(type: address, value: Unit 8161 Box 6817\n",
+      "DPO AE 26241, char_span: [11: 42])]\n",
       "\n",
-      "Full text: 3... 2... 1... liftoff!\n",
+      "Full text: How do I open my credit card statement?\n",
       "Spans: []\n",
       "\n"
      ]
@@ -721,7 +729,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 14,
    "metadata": {
     "is_executing": true,
     "pycharm": {
@@ -735,9 +743,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'../data/generated_size_1500_date_January_06_2025.json'"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "output_file"
    ]
@@ -755,7 +774,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 16,
    "metadata": {
     "is_executing": true,
     "pycharm": {
@@ -767,7 +786,21 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 1500/1500 [00:00<00:00, 47248.41it/s]\n"
+      "  0%|          | 0/1500 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "loading model en_core_web_sm\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 1500/1500 [00:03<00:00, 386.94it/s]\n"
      ]
     },
     {
@@ -805,110 +838,110 @@
        "      <td>The</td>\n",
        "      <td>DET</td>\n",
        "      <td>DT</td>\n",
-       "      <td>163</td>\n",
+       "      <td>110</td>\n",
        "      <td>O</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>bus</td>\n",
+       "      <td>title</td>\n",
        "      <td>NOUN</td>\n",
        "      <td>NN</td>\n",
-       "      <td>163</td>\n",
+       "      <td>110</td>\n",
        "      <td>O</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>station</td>\n",
-       "      <td>NOUN</td>\n",
-       "      <td>NN</td>\n",
-       "      <td>163</td>\n",
-       "      <td>O</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>is</td>\n",
-       "      <td>AUX</td>\n",
+       "      <td>refers</td>\n",
+       "      <td>VERB</td>\n",
        "      <td>VBZ</td>\n",
-       "      <td>163</td>\n",
+       "      <td>110</td>\n",
        "      <td>O</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>on</td>\n",
+       "      <th>3</th>\n",
+       "      <td>to</td>\n",
        "      <td>ADP</td>\n",
        "      <td>IN</td>\n",
-       "      <td>163</td>\n",
+       "      <td>110</td>\n",
        "      <td>O</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>Via</td>\n",
+       "      <th>4</th>\n",
+       "      <td>Riddersporen</td>\n",
        "      <td>PROPN</td>\n",
        "      <td>NNP</td>\n",
-       "      <td>163</td>\n",
+       "      <td>110</td>\n",
        "      <td>B-street_name</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>Pasquale</td>\n",
-       "      <td>PROPN</td>\n",
-       "      <td>NNP</td>\n",
-       "      <td>163</td>\n",
+       "      <th>5</th>\n",
+       "      <td>1</td>\n",
+       "      <td>NUM</td>\n",
+       "      <td>CD</td>\n",
+       "      <td>110</td>\n",
        "      <td>I-street_name</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>street</td>\n",
+       "      <td>NOUN</td>\n",
+       "      <td>NN</td>\n",
+       "      <td>110</td>\n",
+       "      <td>O</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
        "      <th>7</th>\n",
-       "      <td>Scura</td>\n",
-       "      <td>PROPN</td>\n",
-       "      <td>NNP</td>\n",
-       "      <td>163</td>\n",
-       "      <td>I-street_name</td>\n",
+       "      <td>in</td>\n",
+       "      <td>ADP</td>\n",
+       "      <td>IN</td>\n",
+       "      <td>110</td>\n",
+       "      <td>O</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
-       "      <td>127</td>\n",
-       "      <td>NUM</td>\n",
-       "      <td>CD</td>\n",
-       "      <td>163</td>\n",
-       "      <td>I-street_name</td>\n",
+       "      <td>STAVANGER</td>\n",
+       "      <td>PROPN</td>\n",
+       "      <td>NNP</td>\n",
+       "      <td>110</td>\n",
+       "      <td>B-city</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
-       "      <td>Leigha</td>\n",
-       "      <td>VERB</td>\n",
-       "      <td>VB</td>\n",
-       "      <td>189</td>\n",
-       "      <td>B-name</td>\n",
-       "      <td>1</td>\n",
+       "      <td>.</td>\n",
+       "      <td>PUNCT</td>\n",
+       "      <td>.</td>\n",
+       "      <td>110</td>\n",
+       "      <td>O</td>\n",
+       "      <td>0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "       text    pos  tag  template_id          label  sentence\n",
-       "0       The    DET   DT          163              O         0\n",
-       "1       bus   NOUN   NN          163              O         0\n",
-       "2   station   NOUN   NN          163              O         0\n",
-       "3        is    AUX  VBZ          163              O         0\n",
-       "4        on    ADP   IN          163              O         0\n",
-       "5       Via  PROPN  NNP          163  B-street_name         0\n",
-       "6  Pasquale  PROPN  NNP          163  I-street_name         0\n",
-       "7     Scura  PROPN  NNP          163  I-street_name         0\n",
-       "8       127    NUM   CD          163  I-street_name         0\n",
-       "9    Leigha   VERB   VB          189         B-name         1"
+       "           text    pos  tag  template_id          label  sentence\n",
+       "0           The    DET   DT          110              O         0\n",
+       "1         title   NOUN   NN          110              O         0\n",
+       "2        refers   VERB  VBZ          110              O         0\n",
+       "3            to    ADP   IN          110              O         0\n",
+       "4  Riddersporen  PROPN  NNP          110  B-street_name         0\n",
+       "5             1    NUM   CD          110  I-street_name         0\n",
+       "6        street   NOUN   NN          110              O         0\n",
+       "7            in    ADP   IN          110              O         0\n",
+       "8     STAVANGER  PROPN  NNP          110         B-city         0\n",
+       "9             .  PUNCT    .          110              O         0"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -920,7 +953,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 17,
    "metadata": {
     "is_executing": true,
     "pycharm": {

	number	gender	nationality	prefix	first_name	middle_initial	last_name	street_name	city	state_abbr	...	company	domain_name	person	name	first_name_female	first_name_male	prefix_female	prefix_male	last_name_female	last_name_male
0	1	female	Czech	Mrs.	Marie	J	Hamanová	P.O. Box 255	Kangerlussuaq	QE	...	Simple Solutions	MarathonDancing.gl	Marie Hamanová	Marie Hamanová	Marie		Mrs.		Hamanová
1	2	female	French	Ms.	Patricia	G	Desrosiers	Avenida Noruega 42	Vila Real	VR	...	Formula Gray	LostMillions.com.pt	Patricia Desrosiers	Patricia Desrosiers	Patricia		Ms.		Desrosiers
2	3	female	American	Ms.	Debra	O	Neal	1659 Hoog St	Brakpan	GA	...	Dahlkemper's	MediumTube.co.za	Debra Neal	Debra Neal	Debra		Ms.		Neal
3	4	male	French	Mr.	Peverell	C	Racine	183 Epimenidou Street	Limassol	LI	...	Quickbiz	ImproveLook.com.cy	Peverell Racine	Peverell Racine		Peverell		Mr.		Racine
4	5	female	Slovenian	Mrs.	Iolanda	S	Tratnik	Karu põik 61	Pärnu	PR	...	Dubrow's Cafeteria	PostTan.com.ee	Iolanda Tratnik	Iolanda Tratnik	Iolanda		Mrs.		Tratnik
	text	pos	tag	template_id	label	sentence
0	The	DET	DT	163	O	0
1	bus	NOUN	NN	163	O	0
2	station	NOUN	NN	163	O	0
3	is	AUX	VBZ	163	O	0
4	on	ADP	IN	163	O	0
5	Via	PROPN	NNP	163	B-street_name	0
6	Pasquale	PROPN	NNP	163	I-street_name	0
7	Scura	PROPN	NNP	163	I-street_name	0
8	127	NUM	CD	163	I-street_name	0
9	Leigha	VERB	VB	189	B-name	1
10	C	PROPN	NNP	189	I-name	1
11	Mackay\\n\\nLa	NOUN	NN	189	I-name	1
12	Sagne\\nSwitzerland	PROPN	NNP	189	B-city	1
13	Can	AUX	MD	57	O	2
14	someone	PRON	NN	57	O	2
15	call	VERB	VB	57	O	2
16	me	PRON	PRP	57	O	2
17	on	ADP	IN	57	O	2
18	06	NUM	CD	57	B-phone_number	2
19	-	SYM	SYM	57	I-phone_number	2