From 3785fc800276414d17b57db938bf0d3d6401b2e1 Mon Sep 17 00:00:00 2001 From: Serge Retkowsky Date: Thu, 12 Oct 2023 17:24:45 +0200 Subject: [PATCH] Add files via upload --- PII analysis/PII analysis.ipynb | 354 ++++++++++++++++++++++++++++++++ PII analysis/azure.env | 4 + 2 files changed, 358 insertions(+) create mode 100644 PII analysis/PII analysis.ipynb create mode 100644 PII analysis/azure.env diff --git a/PII analysis/PII analysis.ipynb b/PII analysis/PII analysis.ipynb new file mode 100644 index 0000000..ef5162e --- /dev/null +++ b/PII analysis/PII analysis.ipynb @@ -0,0 +1,354 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "20e46c2f", + "metadata": {}, + "source": [ + "# PII analysis with Azure Open AI" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a9ed10c4", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import openai\n", + "import os\n", + "import sys\n", + "\n", + "from dotenv import load_dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bacb98f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Open AI version: 0.28.1\n" + ] + } + ], + "source": [ + "load_dotenv(\"azure.env\")\n", + "\n", + "# Azure Open AI\n", + "openai.api_type: str = \"azure\"\n", + "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n", + "openai.api_base = os.getenv(\"OPENAI_API_BASE\")\n", + "openai.api_version = os.getenv(\"OPENAI_API_VERSION\")\n", + "\n", + "print(\"Open AI version:\", openai.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6b463bec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Today is: 12-Oct-2023 14:43:37\n" + ] + } + ], + "source": [ + "print(\"Today is:\", datetime.datetime.today().strftime(\"%d-%b-%Y %H:%M:%S\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3dd263bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sys.version" + ] + }, + { + "cell_type": "markdown", + "id": "bf083314", + "metadata": {}, + "source": [ + "## Function" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1a4b5443", + "metadata": {}, + "outputs": [], + "source": [ + "model = \"text-davinci-003\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c2dd2175", + "metadata": {}, + "outputs": [], + "source": [ + "def azure_openai(prompt, temperature=0.8):\n", + " \"\"\"\n", + " Get Azure Open AI results\n", + " \"\"\"\n", + " prompt = prompt + \"\\n\" + text\n", + "\n", + " results = openai.Completion.create(\n", + " engine=model,\n", + " prompt=prompt,\n", + " temperature=temperature,\n", + " max_tokens=800,\n", + " )\n", + "\n", + " answer = results[\"choices\"][0][\"text\"].strip(\"\\n\")\n", + "\n", + " return answer" + ] + }, + { + "cell_type": "markdown", + "id": "6ba8838d", + "metadata": {}, + "source": [ + "## PII analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5d782d44", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Parker Doe has repaid all of their loans as of 2020-04-25.\n", + "Their SSN is 859-98-0987. To contact them, use their phone number\n", + "555-555-5555. They are originally from Brazil and have Brazilian CPF number 998.214.865-68\n", + "\n" + ] + } + ], + "source": [ + "text = \"\"\"\n", + "Parker Doe has repaid all of their loans as of 2020-04-25.\n", + "Their SSN is 859-98-0987. To contact them, use their phone number\n", + "555-555-5555. They are originally from Brazil and have Brazilian CPF number 998.214.865-68\n", + "\"\"\"\n", + "\n", + "print(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b2a50475", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Personally identifiable information in this text:\n", + "1. Parker Doe \n", + "2. Social Security Number (SSN): 859-98-0987\n", + "3. Phone Number: 555-555-5555\n", + "4. Country of Origin: Brazil\n", + "5. CPF Number: 998.214.865-68\n" + ] + } + ], + "source": [ + "answer = azure_openai(\"What are the Personally identifiable information in this text?\")\n", + "print(answer)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1f0cdfe0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"PII\": {\n", + " \"Name\": \"Parker Doe\",\n", + " \"SSN\": \"859-98-0987\",\n", + " \"Phone Number\": \"555-555-5555\",\n", + " \"Country of Origin\": \"Brazil\",\n", + " \"CPF Number\": \"998.214.865-68\"\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "answer = azure_openai(\n", + " \"What are the Personally identifiable information in this text? Save in a json file\"\n", + ")\n", + "json = answer\n", + "print(json)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8c97e04", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6b1ec71b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RECORD #333582770390100 | MH | 85986313 | | 054351 | 2/14/2001 12:00:00 AM | CORONARY ARTERY DISEASE | Signed | DIS | Admission Date: 5/22/2001 eport Status: Signed Discharge Date: 4/24/2001 ADMISSION DIAGNOSIS: CORONARY ARTERY DISEASE. HISTORY OF PRESENT ILLNESS: The patient is a 54-year-old gentleman with a history of progressive angina over the past several months. The patient had a cardiac catheterization in July of this year revealing total occlusion of the RCA and 50% left main disease , with a strong family history of coronary artery disease with a brother dying at the age of 52 from a myocardial infarction and another brother who is status post coronary artery bypass grafting. The patient had a stress echocardiogram done on July , 2001 , which showed no wall motion abnormalities ,but this was a difficult study due to body habitus. The patient went for six minutes with minimal ST depressions in the anterior lateral leads , thought due to fatigue and wrist pain , his anginal equivalent. Due to the patient's increased symptoms and family history and history left main disease with total occasional of his RCA was referred for revascularization with open heart surgery.\"\n", + "\n" + ] + } + ], + "source": [ + "text = \"\"\"\n", + "RECORD #333582770390100 | MH | 85986313 | | 054351 | 2/14/2001 12:00:00 AM | \\\n", + "CORONARY ARTERY DISEASE | Signed | DIS | Admission Date: 5/22/2001 \\\n", + "eport Status: Signed Discharge Date: 4/24/2001 ADMISSION DIAGNOSIS: \\\n", + "CORONARY ARTERY DISEASE. HISTORY OF PRESENT ILLNESS: \\\n", + "The patient is a 54-year-old gentleman with a history of progressive angina over the past several months. \\\n", + "The patient had a cardiac catheterization in July of this year revealing total occlusion of the RCA and \\\n", + "50% left main disease , with a strong family history of coronary artery disease with a brother dying at \\\n", + "the age of 52 from a myocardial infarction and another brother who is status post coronary artery bypass grafting. \\\n", + "The patient had a stress echocardiogram done on July , 2001 , which showed no wall motion abnormalities ,\\\n", + "but this was a difficult study due to body habitus. The patient went for six minutes with minimal ST depressions \\\n", + "in the anterior lateral leads , thought due to fatigue and wrist pain , his anginal equivalent. Due to the patient's \\\n", + "increased symptoms and family history and history left main disease with total occasional of his RCA was referred \\\n", + "for revascularization with open heart surgery.\"\n", + "\"\"\"\n", + "print(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bd63c6b6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + "\"Personally Identifiable Information\": [\n", + "\"Name: MH\",\n", + "\"Age: 54\",\n", + "\"Date of birth: 2/14/2001\",\n", + "\"Diagnosis: Coronary Artery Disease\",\n", + "\"Admission Date: 5/22/2001\",\n", + "\"Discharge Date: 4/24/2001\",\n", + "\"Family History: Brother died at age 52 from a myocardial infarction and another brother who is status post coronary artery bypass grafting\"\n", + "]\n", + "}\n" + ] + } + ], + "source": [ + "answer = azure_openai(\n", + " \"What are the Personally identifiable information in this text? Extract this into a json file\"\n", + ")\n", + "print(answer)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a8cb2952", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Personally Identifiable Information: Patient’s name, age (54), gender (male), family history (brothers died at age 52 from myocardial infarction; brother who is status post coronary artery bypass grafting).\n", + "\n", + "Main Events: \n", + "July 2001: Cardiac catheterization reveals total occlusion of RCA and 50% left main disease; Stress echocardiogram done \n", + "May 22, 2001: Admission date \n", + "April 24, 2001: Discharge date\n" + ] + } + ], + "source": [ + "answer = azure_openai(\n", + " \"What are the Personally identifiable information in this text? Extract only the main events with their date\"\n", + ")\n", + "print(answer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f23f33b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10 - SDK v2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/PII analysis/azure.env b/PII analysis/azure.env new file mode 100644 index 0000000..1d22e69 --- /dev/null +++ b/PII analysis/azure.env @@ -0,0 +1,4 @@ +# Azure Open AI +OPENAI_API_BASE = "tobereplaced" +OPENAI_API_KEY = "tobereplaced" +OPENAI_API_VERSION = "2023-05-15"