Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CC BY-NC-SA 4.0 #149

Open
wants to merge 64 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
43a434c
Add files via upload
Jackie-Soo Oct 29, 2024
cd2f7fb
Update exercise1.jv
Jackie-Soo Oct 29, 2024
7539720
second update exercise1.jv
Jackie-Soo Oct 29, 2024
f44275e
exe2,0
Jackie-Soo Nov 5, 2024
5389b83
Create project-plan.md
Jackie-Soo Nov 5, 2024
3a720fc
project-plan.md
Jackie-Soo Nov 5, 2024
17e53d4
Initial project-plan.md
Jackie-Soo Nov 7, 2024
7019fde
Initial project-plan.md 2,0
Jackie-Soo Nov 7, 2024
82d563f
Initial project-plan.md 3,0
Jackie-Soo Nov 7, 2024
14b6ca7
Update project-plan.md
Jackie-Soo Nov 10, 2024
4ef18ce
Update project-plan.md
Jackie-Soo Nov 10, 2024
e83f947
Project3
Jackie-Soo Nov 12, 2024
540158e
Create pipeline.py
Jackie-Soo Nov 12, 2024
a47e817
Rename pipeline.py to pipeline.sh
Jackie-Soo Nov 12, 2024
6570a81
Update project-plan.md
Jackie-Soo Nov 12, 2024
d79ae4f
ex3 1,0
Jackie-Soo Nov 17, 2024
c73e136
ex3 2,0
Jackie-Soo Nov 17, 2024
3510d50
ex3 3,0
Jackie-Soo Nov 17, 2024
28243d8
Update project3.py
Jackie-Soo Nov 24, 2024
3000966
Update pipeline.sh
Jackie-Soo Nov 24, 2024
9b8718e
Update project3.py
Jackie-Soo Nov 27, 2024
cc84741
Add files via upload
Jackie-Soo Nov 27, 2024
2f656f9
Update project3.py
Jackie-Soo Nov 27, 2024
2c4d579
exe4,1
Jackie-Soo Dec 1, 2024
be14c4c
exercise4,2
Jackie-Soo Dec 1, 2024
6776832
exe4,3
Jackie-Soo Dec 1, 2024
4abe7ab
Add files via upload
Jackie-Soo Dec 1, 2024
5aa0527
Create tests.sh
Jackie-Soo Dec 1, 2024
70c98c4
Update system-test.py
Jackie-Soo Dec 1, 2024
fbded1b
Update system-test.py
Jackie-Soo Dec 2, 2024
a0290ae
Create run-project3.yml
Jackie-Soo Dec 8, 2024
b7775fc
Update run-project3.yml
Jackie-Soo Dec 9, 2024
a1b5072
Add files via upload
Jackie-Soo Dec 9, 2024
34cae16
Update requirements.txt
Jackie-Soo Dec 9, 2024
bd300a2
Update run-project3.yml
Jackie-Soo Dec 9, 2024
b05957b
Update run-project3.yml
Jackie-Soo Dec 9, 2024
7b07b69
Update requirements.txt
Jackie-Soo Dec 9, 2024
3732346
Update run-project3.yml
Jackie-Soo Dec 9, 2024
44de32c
Update project3.py
Jackie-Soo Dec 9, 2024
7762536
Update run-project3.yml
Jackie-Soo Dec 9, 2024
fc5e2b1
Update project3.py
Jackie-Soo Dec 11, 2024
918ed4b
Update tests.sh
Jackie-Soo Dec 11, 2024
9753261
Update and rename system-test.py to system_test.py
Jackie-Soo Dec 11, 2024
dbec13c
Update project3.py
Jackie-Soo Dec 11, 2024
4237de2
Update project3.py
Jackie-Soo Dec 11, 2024
7c1f488
Create tests.yml
Jackie-Soo Dec 11, 2024
6c85ad5
Update tests.yml
Jackie-Soo Dec 11, 2024
e6c75cb
Update tests.yml
Jackie-Soo Dec 11, 2024
765f87f
Update system_test.py
Jackie-Soo Dec 11, 2024
be2b4ee
Update tests.sh
Jackie-Soo Dec 11, 2024
fc8f71c
Update tests.sh
Jackie-Soo Dec 11, 2024
57fcc42
Update pipeline.sh
Jackie-Soo Dec 11, 2024
9d367eb
Update pipeline.sh
Jackie-Soo Dec 11, 2024
a8c7f4e
Update tests.yml
Jackie-Soo Dec 11, 2024
4b0a012
exe5,0
Jackie-Soo Dec 14, 2024
5b8ea4c
exe5,1
Jackie-Soo Dec 15, 2024
3bf6575
exe5,2
Jackie-Soo Dec 15, 2024
29f1ec7
exe5,3
Jackie-Soo Dec 15, 2024
4c81108
Add files via upload
Jackie-Soo Jan 9, 2025
16b9881
Add files via upload
Jackie-Soo Jan 9, 2025
8cbbe9e
Create LICENSE
Jackie-Soo Jan 17, 2025
8f92c3f
Update README.md
Jackie-Soo Jan 17, 2025
62c8bde
Update README.md
Jackie-Soo Jan 17, 2025
589bc60
Update README.md
Jackie-Soo Jan 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add files via upload
  • Loading branch information
Jackie-Soo authored Dec 1, 2024
commit 4abe7ab804b4c704b35b26c059143a51ee142385
175 changes: 175 additions & 0 deletions project/system-test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import unittest
from unittest.mock import patch, Mock
import os
import pandas as pd
from project3 import main, download_dataset, read_csv, transform_crime_enforcement_data, transform_factors_data, load_to_db

# Directory that holds the local CSV fixtures read by the system test.
TEST_DATA_DIR = 'dataset'

# Path of the SQLite database file whose existence the system test checks.
DATABASE_PATH = os.path.join('data', 'project3.db')


class SystemTestDataPipeline(unittest.TestCase):
    """System test for the project3 data pipeline.

    The download, CSV-reading and SQLite-connection entry points of
    ``project3`` are patched so that ``main()`` runs against the CSV files
    stored under ``TEST_DATA_DIR`` and a mock database connection.
    """

    @patch('project3.kagglehub.dataset_download')
    @patch('project3.read_csv')
    @patch('project3.sqlite3.connect')
    def test_full_data_pipeline(self, mock_connect, mock_read_csv, mock_download_dataset):
        """Run ``main()`` end to end and check each pipeline stage.

        ``patch`` decorators are applied bottom-up, so the mock arguments
        arrive in the reverse order of the decorators above.
        """
        crime_dir = os.path.join(TEST_DATA_DIR, 'crime_enforcement_data')
        factors_dir = os.path.join(TEST_DATA_DIR, 'factors_data')

        # Data downloading: the two download calls return the local fixture folders.
        mock_download_dataset.side_effect = [crime_dir, factors_dir]

        # (path, encoding) of the five CSV fixtures, in the order they are served.
        csv_sources = [
            (os.path.join(crime_dir, 'ca_offenses_by_city.csv'), 'UTF-8'),
            (os.path.join(crime_dir, 'ca_law_enforcement_by_city.csv'), 'UTF-8'),
            (os.path.join(factors_dir, 'MedianHouseholdIncome2015.csv'), 'ISO-8859-1'),
            (os.path.join(factors_dir, 'PercentagePeopleBelowPovertyLevel.csv'), 'ISO-8859-1'),
            (os.path.join(factors_dir, 'PercentOver25CompletedHighSchool.csv'), 'ISO-8859-1'),
        ]

        # Keep our own references to the frames: Mock stores an iterable
        # side_effect as an iterator, so it can neither be indexed nor re-read
        # once main() has consumed it.
        frames = [pd.read_csv(path, encoding=encoding) for path, encoding in csv_sources]

        # Hand main() copies so that any in-place changes made by the pipeline
        # cannot leak into the frames that are transformed again below.
        mock_read_csv.side_effect = [frame.copy() for frame in frames]

        # Connect the database.
        mock_conn = Mock()
        mock_connect.return_value = mock_conn

        # Run main.
        main()

        # Printed after main() so the list actually contains the recorded calls.
        print(mock_read_csv.call_args_list)  # check the path of 5 datasets

        # Validate that the datasets are downloaded.
        # NOTE(review): these strings equal the mocked *return* paths (on POSIX);
        # confirm they are also the arguments main() passes to
        # kagglehub.dataset_download.
        mock_download_dataset.assert_any_call("dataset/crime_enforcement_data")
        mock_download_dataset.assert_any_call("dataset/factors_data")

        # Validate that every CSV file is read.
        for path, _encoding in csv_sources:
            mock_read_csv.assert_any_call(path)

        # Offenses, law enforcement, income, poverty and high-school frames.
        c_data, e_data, i_data, p_data, h_data = frames

        # Test the transformed data.
        transformed_data = transform_crime_enforcement_data(c_data, e_data)
        transformed_factors_data = transform_factors_data(i_data, p_data, h_data)

        expected_columns = [
            'City', 'Violent crime', 'Murder and nonnegligent manslaughter',
            'Rape (revised definition)', 'Robbery', 'Aggravated assault',
            'Property crime', 'Burglary', 'Larceny-theft', 'Motor vehicle theft',
            'Arson', 'Population', 'Total law enforcement employees',
            'Total officers', 'Total civilians'
        ]
        # Validate column names.
        self.assertListEqual(list(transformed_data.columns), expected_columns)

        expected_dtypes = {
            'City': 'string',
            'Violent crime': 'int32',
            'Murder and nonnegligent manslaughter': 'int32',
            'Rape (revised definition)': 'int32',
            'Robbery': 'int32',
            'Aggravated assault': 'int32',
            'Property crime': 'int32',
            'Burglary': 'int32',
            'Larceny-theft': 'int32',
            'Motor vehicle theft': 'int32',
            'Arson': 'int32',
            'Population': 'int32',
            'Total law enforcement employees': 'int32',
            'Total officers': 'int32',
            'Total civilians': 'int32'
        }
        # Validate datatypes; subTest reports every mismatching column separately.
        for column, dtype in expected_dtypes.items():
            with self.subTest(column=column):
                self.assertEqual(transformed_data[column].dtype.name, dtype,
                                 f"Column {column} dtype doesn't match")

        expected_columns_factors = [
            'City', 'Median Income', 'poverty_rate', 'percent_completed_hs'
        ]
        # Column names validation.
        self.assertListEqual(list(transformed_factors_data.columns), expected_columns_factors)

        # Expected datatypes.
        expected_dtypes_factors = {
            'City': 'string',
            'Median Income': 'int32',
            'poverty_rate': 'float64',
            'percent_completed_hs': 'float64'
        }
        # Datatype validation.
        for column, dtype in expected_dtypes_factors.items():
            with self.subTest(column=column):
                self.assertEqual(transformed_factors_data[column].dtype.name, dtype,
                                 f"Column {column} dtype does not match")

        # Validate that the tables are created through the (mock) connection.
        # NOTE(review): the exact statement text must match what project3's
        # loader executes - confirm against load_to_db.
        mock_conn.execute.assert_any_call(
            "CREATE TABLE crime_enforcement ("
            "City STRING, "
            "Violent crime INT, "
            "Murder and nonnegligent manslaughter INT, "
            "Rape (revised definition) INT, "
            "Robbery INT, "
            "Aggravated assault INT, "
            "Property crime INT, "
            "Burglary INT, "
            "Larceny-theft INT, "
            "Motor vehicle theft INT, "
            "Arson INT, "
            "Population INT, "
            "Total law enforcement employees INT, "
            "Total officers INT, "
            "Total civilians INT);"
        )

        mock_conn.execute.assert_any_call(
            "CREATE TABLE factors ("
            "City STRING, "
            "Median Income INT, "
            "poverty_rate FLOAT, "
            "percent_completed_hs FLOAT);"
        )

        # Validate that the data is inserted. The statement lists 15 columns,
        # so it needs 15 placeholders (the previous expectation had only 14).
        mock_conn.execute.assert_any_call(
            "INSERT INTO crime_enforcement (City, Violent crime, Murder and nonnegligent manslaughter, "
            "Rape (revised definition), Robbery, Aggravated assault, Property crime, Burglary, "
            "Larceny-theft, Motor vehicle theft, Arson, Population, Total law enforcement employees, "
            "Total officers, Total civilians) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
        )

        mock_conn.execute.assert_any_call(
            "INSERT INTO factors (City, Median Income, poverty_rate, percent_completed_hs) VALUES (?, ?, ?, ?)"
        )

        # Transaction validation, to see if it committed successfully.
        mock_conn.commit.assert_called_once()
        # Disconnection validation.
        mock_conn.close.assert_called_once()

        # Validate that the log file contains the success message of each stage.
        with open('project3.log', 'r') as log_file:
            log_content = log_file.read()
        self.assertIn('Successfully download Dataset', log_content)
        self.assertIn('Successfully read CSV file', log_content)
        self.assertIn('Successfully transform crime&enforcement data', log_content)
        self.assertIn('Successfully load data to table', log_content)

        # Verify that the database file exists after running the pipeline.
        # NOTE(review): sqlite3.connect is patched in this test, so the file is
        # not created through that call - confirm how it is expected to appear.
        self.assertTrue(os.path.exists(DATABASE_PATH),
                        "file 'project3.db' should exist in the 'data' directory.")


# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()