Feature/4 parse datetime in model (#8)

* Added additional datetime columns to the test CSVs
* Updated sample data and added the first iteration of datetime parsing detection
* Added a print statement to detect the date format
* Added code to handle date strings that are None
* Added an additional regex date format
* Added an additional docstring
* Added additional regex date formats
* Added the new columns to the model.json file
* Added some failing pytest cases for the datetime parser
* Added a docstring to get_frame and changed an argument
* Added a sentence to the get_dataframe method docstring
* Added the missing alter_schema parameter to entity get_dataframe
* Switched to the forked CDM repo
1 parent: 6bff211 · commit: b4805fa · Showing 15 changed files with 755 additions and 388 deletions.
pyproject.toml

@@ -4,11 +4,11 @@
 version = "0.1.0"
 description = "Use pyspark to read CDM entities."
 authors = ["JulesHuisman <[email protected]>"]
 readme = "README.md"
-packages = [{include = "pyspark_cdm"}]
+packages = [{ include = "pyspark_cdm" }]

 [tool.poetry.dependencies]
 python = "<4.0,>=3.8"
-commondatamodel-objectmodel = "^1.7.3"
+commondatamodel-objectmodel = { git = "https://github.com/quantile-development/CDM.git", branch = "master", subdirectory = "objectModel/Python" }
 nest-asyncio = "^1.5.6"
 tenacity = "^8.2.3"

@@ -24,7 +24,4 @@
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

 [tool.pytest.ini_options]
-filterwarnings = [
-    "error",
-    "ignore::DeprecationWarning",
-]
+filterwarnings = ["error", "ignore::DeprecationWarning"]
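Context on the dependency swap: Poetry now installs commondatamodel-objectmodel directly from the quantile-development fork of the CDM repository (the objectModel/Python subdirectory of that git repo) instead of the ^1.7.3 release on PyPI, presumably to pick up changes that have not yet been published.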
pyspark_cdm/datetime_parser.py (new file)

@@ -0,0 +1,85 @@
import re
from typing import Optional

from pyspark.sql import DataFrame
from pyspark.sql.functions import to_timestamp, col

from pyspark_cdm.catalog import Catalog
from .utils import first_non_empty_values

DATE_FORMATS = {
    # yyyy-MM-dd'T'HH:mm:ss.SSSZ doesn't work; the pyspark format should be empty
    None: r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,}Z",
    "M/d/yyyy h:mm:ss a": r"\d{1,2}/\d{1,2}/\d{4} \d{1,2}:\d{2}:\d{2} [AP]M",
    "yyyy-MM-dd'T'HH:mm:ss.SSS": r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}",
    "yyyy-MM-dd'T'HH:mm:ss'Z'": r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z",
    "yyyy-MM-dd'T'HH:mm:ss": r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}",
    "dd-MM-yyyy HH:mm:ss": r"\d{2}-(0[1-9]|1[0-2])-\d{4} \d{2}:\d{2}:\d{2}",
    "dd-MM-yyyy": r"\d{2}-(0[1-9]|1[0-2])-\d{4}",
    "MM-dd-yyyy HH:mm:ss": r"(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])-\d{4} \d{2}:\d{2}:\d{2}",
    "MM-dd-yyyy": r"(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])-\d{4}",
}
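# Editor's note (not part of the commit): dict insertion order matters above,
# because re.match only anchors at the start of the string, not the end. For
# example, "2022-03-22T13:40:11.000" is matched by both ISO patterns:
#
#     >>> import re
#     >>> re.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", "2022-03-22T13:40:11.000")
#     <re.Match object; span=(0, 19), match='2022-03-22T13:40:11'>
#
# so the more specific fractional-seconds entry must be listed first to win.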
class DatetimeParser:
    def __init__(self, df: DataFrame, catalog: Catalog) -> None:
        self.df = df
        self.catalog = catalog

    def detect_date_format(self, date_string: str) -> Optional[str]:
        """
        Tries to find a regex pattern that matches the provided date_string
        and returns the corresponding pyspark format. Returns None for a None
        value; raises an exception when no pattern matches.
        """
        # A regex match fails on a None value, so return None straight away.
        if date_string is None:
            return None

        for pyspark_format, regex in DATE_FORMATS.items():
            if re.match(regex, date_string):
                return pyspark_format

        raise Exception(f"Can't find a matching datetime pattern for {date_string}")
    def try_parsing_datetime_column(
        self,
        df: DataFrame,
        column_name: str,
        datetime_format: str,
    ) -> DataFrame:
        """
        Convert a single datetime column using the to_timestamp function with
        the required datetime format (e.g. "M/d/yyyy h:mm:ss a"). Returns the
        dataframe unchanged when the conversion fails.
        """
        try:
            return df.withColumn(
                column_name,
                to_timestamp(col(column_name), datetime_format),
            )
        except Exception:
            print(f"Failed parsing {column_name} with {datetime_format}")
            return df
    def convert_datetime_columns(self) -> DataFrame:
        """
        Loops over all the timestamp-related columns and transforms them from
        strings into datetime objects.
        """
        # Get the first non-empty value for every timestamp column.
        sampled_values = first_non_empty_values(
            self.df,
            self.catalog.timestamp_columns,
        )

        # Loop over all the timestamp columns and convert each one into a
        # datetime column.
        df_parsed = self.df

        for column_name in self.catalog.timestamp_columns:
            pyspark_format = self.detect_date_format(sampled_values[column_name])

            df_parsed = self.try_parsing_datetime_column(
                df_parsed,
                column_name,
                pyspark_format,
            )

        return df_parsed
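For context, a minimal usage sketch (not part of the commit). It assumes a local Spark session and uses a SimpleNamespace as a stand-in for the real pyspark_cdm Catalog, since convert_datetime_columns only reads catalog.timestamp_columns:

# Editor's sketch: exercising DatetimeParser on a tiny in-memory dataframe.
from types import SimpleNamespace
from pyspark.sql import SparkSession
from pyspark_cdm.datetime_parser import DatetimeParser

spark = SparkSession.builder.master("local[1]").getOrCreate()

df = spark.createDataFrame(
    [("6/12/2023 12:03:02 PM",), ("6/15/2023 1:07:45 AM",)],
    ["createdon"],
)

# Stand-in for the CDM catalog; only timestamp_columns is used here.
catalog = SimpleNamespace(timestamp_columns=["createdon"])

parsed = DatetimeParser(df=df, catalog=catalog).convert_datetime_columns()
parsed.printSchema()  # createdon should now be a timestamp column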
@@ -0,0 +1,16 @@ | ||
import pytest | ||
from pyspark.sql import SparkSession | ||
from pyspark_cdm.datetime_parser import DatetimeParser | ||
from pyspark_cdm.entity import Entity | ||
|
||
@pytest.fixture | ||
def datetime_parser(spark: SparkSession, entity: Entity): | ||
df = entity.get_dataframe(spark, True) | ||
catalog = entity.catalog | ||
|
||
datetime_parser = DatetimeParser( | ||
df=df, | ||
catalog=catalog, | ||
) | ||
|
||
return datetime_parser |
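A sketch of what a test built on this fixture might look like (hypothetical name and assertion; the commit message notes some of the real pytest cases were added in a failing state):

from pyspark.sql.types import TimestampType

def test_timestamp_columns_are_parsed(datetime_parser):
    # After conversion, every catalog timestamp column should be a real
    # Spark timestamp rather than a string.
    df = datetime_parser.convert_datetime_columns()
    for column_name in datetime_parser.catalog.timestamp_columns:
        assert isinstance(df.schema[column_name].dataType, TimestampType)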
One changed file was deleted in this commit; its diff is not rendered.
test CSV sample data

@@ -1,3 +1,3 @@
-7282fed7-c82c-eb11-a813-000d3a3a75c1,"6/12/2023 12:03:02 PM","6/12/2023 12:03:02 PM",6b82fed7-c82c-eb11-a813-000d3a3a75c1,7a7bfed7-c82c-eb11-a813-000d3a3a75c1,7282fed7-c82c-eb11-a813-000d3a3a75c1,56635,False,"0001-01-01T00:00:00.0000000"
-32b07058-d02c-eb11-a813-000d3a3a75c1,"6/12/2023 12:03:02 PM","6/12/2023 12:03:02 PM",2ab07058-d02c-eb11-a813-000d3a3a75c1,7a7bfed7-c82c-eb11-a813-000d3a3a75c1,32b07058-d02c-eb11-a813-000d3a3a75c1,393639,False,"0001-01-01T00:00:00.0000000"
-fd11d7fa-d42c-eb11-a813-000d3a3a75c1,"6/12/2023 12:03:02 PM","6/12/2023 12:03:02 PM",f511d7fa-d42c-eb11-a813-000d3a3a75c1,7a7bfed7-c82c-eb11-a813-000d3a3a75c1,fd11d7fa-d42c-eb11-a813-000d3a3a75c1,542839,False,"0001-01-01T00:00:00.0000000"
+7282fed7-c82c-eb11-a813-000d3a3a75c1,"6/15/2023 12:03:02 PM","6/12/2023 12:03:02 PM",6b82fed7-c82c-eb11-a813-000d3a3a75c1,7a7bfed7-c82c-eb11-a813-000d3a3a75c1,7282fed7-c82c-eb11-a813-000d3a3a75c1,56635,False,"0001-01-01T00:00:00.0000000","2023-12-01T08:25:39.0000000+00:00","2024-01-30T13:24:06Z","20-12-2023","20-12-2023 15:45:03","2022-09-30T07:34:52.5348484Z","2022-03-22T13:40:11.0000000","20-12-2023","2022-05-05T14:38:48.8547924Z","2022-03-12T15:50:11.0000000"
+32b07058-d02c-eb11-a813-000d3a3a75c1,"6/15/2023 12:03:02 PM","6/12/2023 12:03:02 PM",2ab07058-d02c-eb11-a813-000d3a3a75c1,7a7bfed7-c82c-eb11-a813-000d3a3a75c1,32b07058-d02c-eb11-a813-000d3a3a75c1,393639,False,"0001-01-01T00:00:00.0000000","2023-12-01T08:25:39.0000000+00:00","2024-01-30T13:24:06Z","20-12-2023","20-12-2023 15:45:03","2022-09-30T07:34:52.5348484Z","2022-03-22T13:40:11.0000000","20-12-2023","2022-05-05T14:38:48.8547924Z","2022-03-12T15:50:11.0000000"
+fd11d7fa-d42c-eb11-a813-000d3a3a75c1,"6/15/2023 12:03:02 PM","6/12/2023 12:03:02 PM",f511d7fa-d42c-eb11-a813-000d3a3a75c1,7a7bfed7-c82c-eb11-a813-000d3a3a75c1,fd11d7fa-d42c-eb11-a813-000d3a3a75c1,542839,False,"0001-01-01T00:00:00.0000000","2023-12-01T08:25:39.0000000+00:00","2024-01-30T13:24:06Z","20-12-2023","20-12-2023 15:45:03","2022-09-30T07:34:52.5348484Z","2022-03-22T13:40:11.0000000","20-12-2023","2022-05-05T14:38:48.8547924Z","2022-03-12T15:50:11.0000000"
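As a spot-check of the new sample values against the parser (editor's sketch, not part of the commit): df and catalog are unused during format detection, so None placeholders suffice here.

from pyspark_cdm.datetime_parser import DatetimeParser

parser = DatetimeParser(df=None, catalog=None)  # detection touches neither

# Which pyspark format each new sample value resolves to:
assert parser.detect_date_format("2024-01-30T13:24:06Z") == "yyyy-MM-dd'T'HH:mm:ss'Z'"
assert parser.detect_date_format("20-12-2023 15:45:03") == "dd-MM-yyyy HH:mm:ss"
assert parser.detect_date_format("20-12-2023") == "dd-MM-yyyy"
assert parser.detect_date_format("2022-09-30T07:34:52.5348484Z") is None  # empty pyspark format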