-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscript.py
82 lines (70 loc) · 3.11 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import csv
import io
import os

import openai

from schema.schemas import actor_schema, actions_schema, emissions_agg_schema, datasource_schema, target_schema
# Read the OpenAI API key from the OPENAI_KEY environment variable
# (None when the variable is not set).
openai.api_key = os.getenv("OPENAI_KEY")

# Year value interpolated into the emissions prompt.
CURRENT_YEAR = 2023

# Input CSV that is sampled for the prompts and named in the generated output.
file_name = "unfccc-data/New Zealand/Table1s1-Table 1.csv"
# Build a small sample of the input file to embed in the prompt: the leading
# rows plus the closing rows of the table.
with open(file_name, 'r', newline='') as file:
    # newline='' is what the csv module asks for so that line breaks inside
    # quoted fields are parsed correctly.
    all_lines = list(csv.reader(file))

# First nine rows, then at most the last two of the rows that follow them.
# For files with eleven or more rows this is the first nine plus the final
# two; shorter files no longer raise IndexError or repeat rows.
sample_rows = all_lines[:9] + all_lines[9:][-2:]

# Serialise the sample back into CSV text: one line per row, cells separated
# by commas and quoted where needed. Joining the cells with newlines and the
# rows with nothing would fuse the last cell of a row with the first cell of
# the next one and hide the row structure from the model.
sample_buffer = io.StringIO()
csv.writer(sample_buffer, lineterminator='\n').writerows(sample_rows)
sample_rows_string = sample_buffer.getvalue()
# Opening section shared by every prompt: the task statement followed by the
# sampled rows of the input file.
start_prompt = f"""
Using this sample input data, generate Python code in markdown
that reads in the file {file_name} and maps to this schema
Use the name without extension {file_name} + year for the datasource_id
Use the file name for the name field
""" + sample_rows_string

# Instructions for resolving the actor through the OpenClimate search API.
additional_prompts = """
Find the actor name in the folder name of the file path and it's a country
Query the api https://openclimate.openearth.dev/api/v1/search/actor with query parameter name to get actor_id.
The response from the api looks like {
data: [ {actor_id:''} ]
}
"""

# Used to process data from https://unfccc.int/reports
country_emissions_prompt = f"""
Here are the instructions for finding the right data:
The first few rows may be invalid
Total_emissions should be in a total column
Use math.floor function to convert emissions to integers
Year field is hardcoded to {CURRENT_YEAR}
The emissions_id is the file name plus the year
"""

# Closing section; the placeholder receives the name of the CSV to produce.
output_prompt = " output to a csv file with header named {}.csv"

# One entry per prompt: (key in `prompts`, output CSV name, middle sections
# in the order they appear between the opening and closing sections).
_prompt_layout = (
    ('DataSource', "datasource", (datasource_schema,)),
    ('Actor', "actor", (additional_prompts, actor_schema)),
    ('EmissionsAgg', "emissions_agg",
     (emissions_agg_schema, additional_prompts, country_emissions_prompt)),
    ('Target', "target", (additional_prompts, target_schema)),
    ('Action', "action", (additional_prompts, actions_schema)),
)

# Full prompt text per schema, with the sections joined by single spaces.
prompts = {
    schema_key: ' '.join([start_prompt, *sections, output_prompt.format(csv_stem)])
    for schema_key, csv_stem, sections in _prompt_layout
}
# System message sent with every request; it does not depend on the schema,
# so it is built once outside the loop.
system_prompt = (
    "\nYou are a data analyst tasked with mapping a bunch of dirty CSV files"
    " to a conformed schema.\n"
)

# Create the destination folder before any request is made, so a missing
# directory cannot cause a completed response to be lost at write time.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
source_name = os.path.basename(file_name)

# Ask the model for one mapping script per schema and save the Python code
# it returns as output/<input file name>.<schema name>.py
for schema_name, prompt in prompts.items():
    print('running for prompt: ' + schema_name)
    print(prompt)
    values = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=4096,
    )

    # Splitting on the fence marker puts fenced content at the odd indexes;
    # looking only there keeps prose that merely begins with the word
    # "python" from being written out as code.
    output_message = values["choices"][0]["message"]["content"].split('```')
    fenced_segments = output_message[1::2]

    # Keep the fences tagged as python and drop the language tag itself.
    valid_messages = [
        segment[len('python'):]
        for segment in fenced_segments
        if segment.startswith('python')
    ]
    if not valid_messages:
        # The file is still written (empty), but the gap is now reported.
        print('warning: no python code block returned for ' + schema_name)

    output_path = os.path.join(output_dir, source_name + '.' + schema_name + '.py')
    with open(output_path, 'w') as file:
        file.write('\n'.join(valid_messages))