#!/usr/bin/env python3
print(";; ...doing...", __name__)
# Version Space Candidate Elimination inside of MeTTa
# This implementation focuses on bringing this machine learning algorithm into the MeTTa relational programming environment.
# Douglas R. Miles 2023

# Standard Library Imports
import atexit, io, inspect, json, os, re, subprocess, sys, traceback
from collections import Counter
from collections.abc import Iterable
from glob import glob
from time import monotonic_ns, time

# Third-Party Imports
import chardet
import pandas as pd

# Global Variables
VSPACE_VERBOSE = os.environ.get("VSPACE_VERBOSE")
# 0 = for scripts/demos
# 1 = developer
# 2 = debugger
verbose = 1
if VSPACE_VERBOSE is not None:
    try: verbose = int(VSPACE_VERBOSE)  # Convert it to an integer
    except ValueError: pass  # keep the default when the value is not numeric
# Mode selected via --metta/--mettalog/--python in handle_arg(); None until set
argmode = None
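# For example, `VSPACE_VERBOSE=2 python panda_util.py <dir>` (a hypothetical
# invocation) raises verbosity to debugger level for this module.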
def timeFrom(w, t0):
    elapsed_ns = monotonic_ns() - t0
    elapsed_s = elapsed_ns / 1e9
    elapsed_ms = elapsed_ns / 1e6
    elapsed_us = elapsed_ns / 1e3
    if elapsed_s >= 1:
        print_cmt(f"{w} took {elapsed_s:.5f} seconds")
    elif elapsed_ms >= 1:
        print_cmt(f"{w} took {elapsed_ms:.5f} milliseconds")
    else:
        print_cmt(f"{w} took {elapsed_us:.5f} microseconds")
def redirect_stdout(inner_function):
    old_stdout = sys.stdout  # Save the current stdout stream
    new_stdout = io.StringIO()  # Create a new StringIO buffer
    sys.stdout = new_stdout  # Redirect stdout to the new buffer
    try:
        inner_function()  # Execute the inner function
    finally:
        sys.stdout = old_stdout  # Restore the original stdout stream
    output = new_stdout.getvalue()  # Retrieve the output from the new buffer
    new_stdout.close()  # Close the new buffer
    return output
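# Example (a minimal sketch): capture whatever a callable prints.
#   captured = redirect_stdout(lambda: print("hello"))
#   assert captured == "hello\n"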
def flush_console():
    try:
        if sys.__stdout__ is not None: sys.__stdout__.flush()
    except Exception: pass
    try:
        if sys.__stderr__ is not None: sys.__stderr__.flush()
    except Exception: pass
    try:
        if sys.stderr is not None and not (sys.stderr is sys.__stderr__): sys.stderr.flush()
    except Exception: pass
    try:
        if sys.stdout is not None and not (sys.stdout is sys.__stdout__): sys.stdout.flush()
    except Exception: pass
# Exporting to another CSV (for demonstration)
#df.to_csv("exported.csv", index=False)
#print_cmt("\n### Data Exported to 'exported.csv' ###")
def detect_encoding(file_path, sample_size=20000):
    with open(file_path, 'rb') as f:
        raw = f.read(sample_size)
    return chardet.detect(raw)['encoding']
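# Note: chardet.detect() can report an encoding of None when the sample is
# inconclusive, so callers should be prepared to fall back to a default.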
def is_lisp_dashed(s):
    # Letters, digits, underscores, colons, and dashes only: safe as a bare symbol
    pattern = re.compile(r'^[A-Za-z0-9_:-]+$')
    return bool(pattern.match(s))
def item_string(lst, functor=""):
    # Render a single value as a MeTTa/Lisp-friendly token.
    if isinstance(lst, (int, float)):
        return repr(lst)
    if isinstance(lst, str):
        if len(lst) == 0:
            return '""'
        if any(char in lst for char in [' ', '"', "'", "(", ")", ".", "\\"]):
            return json.dumps(lst)
        if is_float_string(lst):
            return repr(float(lst))
        if lst.isdigit():
            return repr(int(lst))
        if lst.isalnum():
            if lst[0].isdigit(): return json.dumps(lst)
            return lst
        if lst == "#":
            return lst
        if is_lisp_dashed(lst):
            return lst
        return json.dumps(lst)
    try:
        if isinstance(lst, Iterable):
            return '(' + functor + ' '.join([item_string(vv) for vv in lst]) + ')'
        else:
            return str(lst)
    except TypeError:
        return str(lst)
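# Expected renderings given the rules above (a sketch, not a test suite):
#   item_string("")          -> '""'
#   item_string("has space") -> '"has space"'
#   item_string(42)          -> '42'
#   item_string("gene-sym")  -> 'gene-sym'
#   item_string([1, 2])      -> '(1 2)'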
def list_string(lst, functor="# "):
    try:
        if isinstance(lst, Iterable) and not isinstance(lst, str):
            if len(lst) == 0:
                return '()'
            return '(' + functor + ' '.join([item_string(vv) for vv in lst]) + ')'
        else:
            return item_string(lst)
    except TypeError:
        return item_string(lst)
def is_float_string(s):
    return bool(re.fullmatch(r'[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?', s))
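# Note that plain integer strings also match ("42" is a float string), so
# item_string normalizes them through float(). A few sample results:
#   is_float_string("3.14")  -> True
#   is_float_string("-1e-9") -> True
#   is_float_string("abc")   -> False
#   list_string([1, 2, 3])   -> '(# 1 2 3)'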
def update_dataframe_skipping_first_row(df):
    """
    Takes a DataFrame, skips the first row, recalculates unique counts and value counts,
    and infers the most appropriate datatypes for each column.

    Parameters:
        df (pandas.DataFrame): The original DataFrame.

    Returns:
        pandas.DataFrame: A DataFrame with the first row removed, updated with recalculated
        uniqueness and value counts, and with inferred datatypes.
    """
    # Check if the DataFrame is empty or has only one row
    if df.empty or df.shape[0] == 1:
        raise ValueError("DataFrame is empty or has only one row, which cannot be skipped.")
    # Skip the first row and reset the index
    updated_df = df.iloc[1:].reset_index(drop=True)
    # Attempt to infer better dtypes for object columns
    updated_df = updated_df.infer_objects()
    updated_df = updated_df.convert_dtypes()
    # Update DataFrame with uniqueness and value counts for each column
    for col in list(updated_df.columns):
        updated_df[f'{col}_unique_count'] = updated_df[col].nunique()
        updated_df[f'{col}_value_counts'] = str(updated_df[col].value_counts().to_dict())
    return updated_df
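# Usage sketch (hypothetical data; nothing in this module calls it):
#   raw = pd.DataFrame([["id", "score"], [1, 0.5], [2, 0.7]])
#   summarized = update_dataframe_skipping_first_row(raw)
#   # adds <col>_unique_count and <col>_value_counts columns for each column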
def analyze_csv_basename(file_path, sep=None):
    base_name = os.path.basename(file_path)
    base_name = base_name.replace('.tsv', '')
    base_name = base_name.replace('.fb', '')
    # Remove a sequence like _fb_####_## (where # represents a digit)
    base_name = re.sub(r'_fb_\d{4}_\d{2}', '', base_name)
    # Remove a sequence like ####_## at any place
    base_name = re.sub(r'\d{4}_\d{2}', '', base_name)
    # Replace periods with underscores, if not part of a file extension
    base_name = re.sub(r'\.(?=.*\.)', '_', base_name)
    analyze_csv(base_name, file_path, sep=sep)
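# For example, a FlyBase-style file "gene_map_table_fb_2023_01.tsv" is analyzed
# under the base name "gene_map_table".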
needed_Skip = 0  # set to 1 when a header row had to be skipped; reported via --analyze
def analyze_csv(base_name, file_path, sep=None):
    print_cmt(";;------------------------------------------------------------------------------------------------------------------")
    print_cmt(f"Analyzing file: {file_path}")
    missing_values_list = ["", "-", " ", "|", ",", "#", "*", "\"\"", "+", "NULL", "N/A", "--",
                           "NaN", "EMPTY", "None", "n/a", "(none)",
                           # "0","Dmel","-1",
                           "MISSING", "?", "undefined", "unknown", "none", "[]", "."]

    def read_csv(enc, skip_rows, header_option):
        false_values_list = ["F", "f", "False", "false", "N", "No", "no", "FALSE"]
        true_values_list = ["T", "t", "True", "true", "Y", "Yes", "yes", "TRUE"]
        engine = 'python' if sep is None else 'c'
        return pd.read_csv(
            file_path,
            sep=sep,
            encoding=enc,
            comment='#',
            compression='infer',
            true_values=true_values_list,
            false_values=false_values_list,
            engine=engine,
            header=header_option,
            skiprows=skip_rows,
            #names=header_names,
            keep_default_na=False,
            skip_blank_lines=True,
            on_bad_lines='skip'
        )
    def read_csv_both_encodings(skip_rows=None, header_option=None):
        try:
            return read_csv('utf-8', skip_rows, header_option)
        except UnicodeDecodeError:
            encoding = detect_encoding(file_path)
            if encoding != 'utf-8':
                print_cmt(f";; Trying '{encoding}' encoding...")
            try:
                return read_csv(encoding, skip_rows, header_option)
            except Exception as e:
                print_cmt(f";; Error reading the file with '{encoding}' encoding: {e}")
                return None
        except Exception as e:
            print_cmt(f";; Error reading the file: {e}")
            return None

    df = read_csv_both_encodings()
    if df is None:
        return
    # Function to check if a string contains any digits
    def contains_digit(s):
        return any(char.isdigit() for char in s)

    # Read the first few rows to check for digits
    header_candidates = df.head(3)
    first_row_has_no_digits = all(not contains_digit(str(value)) for value in header_candidates.iloc[0])
    second_third_row_has_digits = (len(header_candidates) >= 3
        and any(contains_digit(str(value)) for value in header_candidates.iloc[1])
        and any(contains_digit(str(value)) for value in header_candidates.iloc[2]))
    global needed_Skip
    # If the first row has no digits but the second and third do, treat the first row as a header
    if first_row_has_no_digits and second_third_row_has_digits:
        print_cmt("First row is set as header based on the digit check.")
        df = read_csv_both_encodings(skip_rows=1, header_option=None)
        needed_Skip = 1
        old_columns = header_candidates.iloc[0]
    else:
        print_cmt("Digit check is inconclusive for determining a header row. No header set.")
        old_columns = df.columns
    # Anonymize the column labels unless they are already clean alphabetic names
    need_anon_columns = False
    for col in old_columns:
        if not re.match("^[a-zA-Z]+$", str(col)):
            need_anon_columns = True
            break
    new_columns = [f'{i+1}' for i in range(df.shape[1])] if need_anon_columns else old_columns.tolist()
    df.columns = new_columns
    col_names = ' '.join([f"{col}" for col in new_columns])
    numerical_columns = df.select_dtypes(include=['number']).columns.tolist()
    def metta_read(s):
        print(s)
        # res = the_python_runner.run(json.dumps(s))
        # if len(res) != 0: print_cmt(";;;="+ repr(res))

    metta_read(f"!(file-name {base_name} {file_path})")
    metta_read(f"(num-columns {base_name} {df.shape[1]})")
    metta_read(f"(column-names {base_name} {list_string(old_columns)})")
    metta_read(f"(column-names-maybe {base_name} {list_string(df.columns)})")
    metta_read(f"(duplicated-rows {base_name} {df.duplicated().sum()})")
    metta_read(f"(total-rows {base_name} {len(df)})")
    for col in new_columns:
        metta_read(f"(unique-values {base_name} {col} {df[col].nunique()} {df[col].dtype})")
    # Print the unique NA values and their frequencies for each column
    for col in df.columns:
        missing_count = 0
        unique_na_values = []
        frequency_of_unique_na = []
        for na_val in missing_values_list:
            count = df[df[col] == na_val].shape[0]
            if count > 0:
                missing_count += count
                metta_read(f"(null-value-count {base_name} {col} \"{na_val}\" {count})")
                unique_na_values.append(na_val)
                frequency_of_unique_na.append(count)
        metta_read(f"(missing-values {base_name} {col} {missing_count} {list_string(unique_na_values)} {list_string(frequency_of_unique_na)})")
    hl = 7  # number of entries to report from each end of the frequency table
    for col in df.columns:
        if len(df) != df[col].nunique():
            isfrequents = [["#", val, cnt] for val, cnt in df[col].value_counts(ascending=False).head(hl).items() if val not in missing_values_list or len(val) > 3]
            isfrequents.reverse()
            metta_read(f"(most-frequent {base_name} {col} {list_string(isfrequents)})\n")
            infrequents = [["#", val, cnt] for val, cnt in df[col].value_counts(ascending=True).head(hl).items() if val not in missing_values_list or len(val) > 3]
            metta_read(f"(less-frequent {base_name} {col} {list_string(infrequents)})\n")
            #metta_read(f"(data-types {base_name} {col} {df[col].dtype} )")
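# For a small table, analyze_csv prints S-expressions along these lines
# (illustrative output only; actual values depend on the data):
#   !(file-name mytable data/mytable.csv)
#   (num-columns mytable 3)
#   (total-rows mytable 100)
#   (unique-values mytable 1 42 object)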
def import_metta_file(string):
    global argmode
    if argmode == "mettalog":
        # load_vspace, swip_exec, selected_space_name, and the_python_runner are
        # expected to be provided by the surrounding runtime; they are not defined here.
        load_vspace()
        swip_exec(f"load_metta_file('{selected_space_name}','{string}')")
    else: the_python_runner.import_file(string)
def vspace_main(*args):
    is_init = False
    #os.system('clear')
    t0 = monotonic_ns()
    flush_console()
    #if is_init==False: load_vspace()
    #if is_init==False: load_flybase()
    #if is_init==False:
    # args arrives as a tuple; an element may be a single string or a list of strings
    for arg in args:
        if isinstance(arg, str):
            if len(arg) > 1: handle_arg(arg)
        elif isinstance(arg, (list, tuple)):
            for a in arg:
                if isinstance(a, str) and len(a) > 1: handle_arg(a)
    flush_console()
    global argmode
    #the_python_runner.repl(mode=argmode)
    flush_console()
    if verbose > 1: timeFrom("main", t0)
    flush_console()
def vspace_main_from_python(sysargv1toN):
    vspace_main(sysargv1toN)
def handle_arg(string, skip_filetypes=['.metta', '.md', '.pl', '.png', '.jpg', '.obo']):
    lower = string.lower()
    if lower in ["--metta", "--mettalog", "--python"]:
        global argmode
        argmode = lower.lstrip('-')
        if verbose > 0: print("; argmode=", argmode)
        return
    if os.path.isfile(string):
        if lower.endswith('.metta'):
            if verbose > 0: print("; import_metta_file=", string)
            #import_metta_file(string)
            return
    global needed_Skip
    if string == "--analyze": sys.exit(needed_Skip)
    if os.path.isdir(string):
        # If it's a directory, traverse it
        for root, _, files in os.walk(string):
            for file in files:
                try:
                    if any(file.endswith(ext) for ext in skip_filetypes):
                        if verbose > 0: print_cmt(f"Skipping file: {file}")
                        continue
                    handle_arg(os.path.join(root, file), skip_filetypes)
                except Exception as e:
                    print_cmt(f"An error occurred while processing {string}: {e}")
        return
    elif os.path.isfile(string):
        if lower.endswith('.csv'):
            analyze_csv_basename(string, sep=',')
            return
        elif lower.endswith('.tsv'):
            analyze_csv_basename(string, sep='\t')
            return
        else:
            # Read only the first few lines
            try:
                analyze_csv_basename(string)
            except UnicodeDecodeError:
                print_cmt(f"Passing in file: {string}")
                with open(string, 'r') as file:
                    for i, line in enumerate(file):
                        if i >= 10:
                            break
                        print_cmt(line.strip())
            return
    print_cmt(f"Skipping: {string}")
def print_cmt(*args, prefix=";; "):
    for arg in args:
        println(arg, prefix=prefix)
    flush_console()
def println(orig, prefix=""):
    """
    Prints the given object and returns it.

    Args:
        orig: The object to be printed.

    Returns:
        The same object that was passed in.
    """
    try:
        prefix_print(prefix, orig)
    except Exception as e:
        if verbose > 0: print_cmt(f"println-Error: {e}")
    flush_console()
    return orig
def prefix_print(prefix, orig):
    obj = orig
    if isinstance(obj, str):
        objlns = obj.splitlines()
        for r in objlns:
            print(prefix, r)
        return
    print(prefix, obj)  # non-string objects are printed on a single line
is_init = False
if __name__ == "__main__":
    vspace_main_from_python(sys.argv[1:])