# -*- coding: utf-8 -*-
"""first_ML_test.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1-rN9s_rinkl3iM8Bmlkm2eS6ZIOyHzOm
"""
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import StringLookup, IntegerLookup, Normalization
dftrain = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv') # training data
dfeval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv') # testing data
y_train = dftrain.pop('survived')
y_eval = dfeval.pop('survived')
CATEGORICAL_COLUMNS = ["sex", "class", "deck", "embark_town", "alone", "n_siblings_spouses", "parch"]
NUMERIC_COLUMNS = ["age", "fare"]
feature_preprocessors = {}
# Categorical features: build one-hot lookup layers from the training vocabulary
for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = dftrain[feature_name].unique()
    if dftrain[feature_name].dtype == 'object':  # string-valued categories
        feature_preprocessors[feature_name] = StringLookup(vocabulary=vocabulary, output_mode='one_hot')
    else:  # integer-valued categories
        feature_preprocessors[feature_name] = IntegerLookup(vocabulary=vocabulary, output_mode='one_hot')
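
# Quick sanity check (illustrative addition, not part of the original script):
# a lookup layer with output_mode='one_hot' encodes each value as a vector of
# length len(vocabulary) + 1, where index 0 is the out-of-vocabulary slot.
example_one_hot = feature_preprocessors["sex"](tf.constant([["male"]]))
print("one-hot shape for 'sex':", example_one_hot.shape)  # expected: (1, 3)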

# Numeric features: clean the data and learn a normalization from the training split
for feature_name in NUMERIC_COLUMNS:
    # coerce to numeric and fill missing values in both splits so that
    # training and evaluation see consistent inputs
    for df in (dftrain, dfeval):
        df[feature_name] = pd.to_numeric(df[feature_name], errors='coerce').fillna(0)
    # create a Normalization layer and adapt it to the training data only
    normalizer = Normalization(axis=None)
    normalizer.adapt(dftrain[feature_name].to_numpy())
    feature_preprocessors[feature_name] = normalizer
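
# Illustrative check (not part of the original script): after adapt(), the
# Normalization layer rescales inputs to roughly zero mean and unit variance
# with respect to the training data it saw.
age_normalizer = feature_preprocessors["age"]
print("normalized age 30:", age_normalizer(np.array([[30.0]])).numpy())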

def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    def input_function():  # inner function, this will be returned
        # create a tf.data.Dataset pairing the features with their labels
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000)  # randomize the order of the data
        # split the dataset into batches and repeat for the given number of epochs
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds
    return input_function  # return the function object for later use
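
# Pipeline sanity check (illustrative, not part of the original script): pull
# one batch to confirm the dataset yields a dict of feature tensors plus labels.
for feature_batch, label_batch in make_input_fn(dftrain, y_train)().take(1):
    print("feature keys:", list(feature_batch.keys()))
    print("label batch shape:", label_batch.shape)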

train_input_fn = make_input_fn(dftrain, y_train)  # returns an input_function; calling it yields the dataset fed to the model
eval_input_fn = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False)

# Create one Keras input per feature; integer-coded categories are declared
# int64 so IntegerLookup receives integer inputs
inputs = {name: tf.keras.Input(shape=(1,), name=name,
                               dtype='string' if dftrain[name].dtype == 'object'
                               else 'int64' if name in CATEGORICAL_COLUMNS else 'float32')
          for name in dftrain.columns}
# Apply each preprocessor to its input and concatenate the encoded features
encoded_features = [preprocessor(inputs[name]) for name, preprocessor in feature_preprocessors.items()]
x = tf.keras.layers.concatenate(encoded_features)

# Simple linear model: a single sigmoid unit on top of the encoded features
# (logistic regression)
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
first_model = tf.keras.Model(inputs=inputs, outputs=output)
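
# Optional inspection (added, not in the original script): summarize how the
# per-feature inputs are wired through the preprocessing layers to the output.
first_model.summary()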

# Compile and train the model; the dataset from train_input_fn already repeats
# for num_epochs=10, so a single pass over it covers 10 epochs of the data
first_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
first_model.fit(train_input_fn())

# Evaluate the model on the held-out data
result = first_model.evaluate(eval_input_fn())
print(f"Accuracy: {result[1]}")
print(result)  # [loss, accuracy]

# Inspect a single example from the evaluation set
passenger_id = 2
predictions = first_model.predict(eval_input_fn())
print(dfeval.loc[passenger_id])   # the passenger's features
print(y_eval[passenger_id])       # the actual outcome (1 = survived)
print(predictions[passenger_id])  # the predicted survival probability
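
# Illustrative follow-up (assumed 0.5 decision threshold, not in the original):
# turn the sigmoid probability into a hard survived/not-survived class.
predicted_class = int(predictions[passenger_id][0] > 0.5)
print("predicted class (1 = survived):", predicted_class)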