-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnormalizer.py
107 lines (78 loc) · 3.55 KB
/
normalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import pandas as pd
import random
import numpy as np
# Turbidity normalization
# Load the aggregated dataset that the rest of this script works on.
df = pd.read_csv("datasets/final_data/aggregated_data.csv")

# Coerce 'Turbidity (FNU)' to numeric; entries that cannot be parsed become NaN.
df["Turbidity (FNU)"] = pd.to_numeric(df["Turbidity (FNU)"], errors="coerce")

# Compute the observed extremes once instead of once per row.
# Series.min()/max() skip NaN by default, so coerced entries do not affect the range.
min_turbidity = df["Turbidity (FNU)"].min()
max_turbidity = df["Turbidity (FNU)"].max()

# Target range for the rescaled values.
target_min = 0.1
target_max = 10

# Min-max rescale into [target_min, target_max] as one vectorized expression.
# NaN inputs propagate to NaN outputs, so no explicit null check is needed.
# If every reading is identical the observed range is zero and the result is NaN.
df["normalized_turbidity"] = (
    (df["Turbidity (FNU)"] - min_turbidity) / (max_turbidity - min_turbidity)
) * (target_max - target_min) + target_min

# df['normalized_turbidity'] now holds the rescaled turbidity, NaN where the input was missing.
print(df["normalized_turbidity"])
# Water temperature normalization
# Coerce the column to numeric first; entries that cannot be parsed become NaN.
df["Mean Water Temp (C)"] = pd.to_numeric(df["Mean Water Temp (C)"], errors="coerce")
water_temp = df["Mean Water Temp (C)"]

# Observed extremes of the cleaned column.
min_temp, max_temp = water_temp.min(), water_temp.max()

# Target range for the rescaled values.
target_min, target_max = 1, 10

# Min-max rescale: shift to zero, divide by the observed range,
# then stretch and shift into [target_min, target_max].
df["normalized_water_temp"] = (
    water_temp.sub(min_temp)
    .div(max_temp - min_temp)
    .mul(target_max - target_min)
    .add(target_min)
)

# Show the rescaled water temperature column.
print(df["normalized_water_temp"])
# Specific conductance normalization
# Coerce the column to numeric first; entries that cannot be parsed become NaN.
df["Mean Specific Conductance"] = pd.to_numeric(
    df["Mean Specific Conductance"], errors="coerce"
)
conductance = df["Mean Specific Conductance"]

# Observed extremes of the cleaned column.
min_conductance = conductance.min()
max_conductance = conductance.max()

# Target range for the rescaled values.
target_min, target_max = 0, 10

# Min-max rescale into [target_min, target_max].
conductance_range = max_conductance - min_conductance
target_span = target_max - target_min
df["normalized_specific_conductance"] = (
    (conductance - min_conductance) / conductance_range * target_span + target_min
)

# Show the rescaled specific conductance column.
print(df["normalized_specific_conductance"])
# pH normalization
# NOTE: the 'pH' column is synthetic. It is filled with random Gaussian draws
# below, replacing any 'pH' column that may already exist in df.
target_mean_pH = 7.5  # centre of the Gaussian the draws come from
std_deviation = 1.0  # spread of the Gaussian

# Draw one value per row and clamp each draw to the interval [5, 10].
synthetic_pH = []
for _ in range(len(df)):
    synthetic_pH.append(max(5, min(10, random.gauss(target_mean_pH, std_deviation))))
df["pH"] = synthetic_pH

# Mean of the generated column; distances are measured from this sample mean.
mean_pH = df["pH"].mean()

# Absolute distance of every pH value from the sample mean.
# This helper column stays in df and is therefore written to the output CSV too.
df["distance_from_mean"] = (df["pH"] - mean_pH).abs()

# Rescale the distances to 0..10 and invert them, so the value closest to the
# mean scores 10 and the farthest scores 0.
distance = df["distance_from_mean"]
min_distance = distance.min()
max_distance = distance.max()
scaled_distance = (distance - min_distance) / (max_distance - min_distance) * 10
df["normalized_pH"] = 10 - scaled_distance

print(df["normalized_pH"])
print(df.columns)

# Persist the frame with all added columns.
df.to_csv("datasets/final_data/normalized_data.csv", index=False)