Skip to content

Commit

Permalink
Merge pull request #9 from EJOOSTEROP:feat_improve_missing_date_logic
Browse files Browse the repository at this point in the history
Date functionality optimizations.
  • Loading branch information
EJOOSTEROP authored Mar 3, 2024
2 parents c9705ea + 3d678c2 commit f0a60f9
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 15 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
<!-- insertion marker -->
## [Unreleased]

## [0.4.5] 2024-03-02
### Changed
- Improve query parameters to identify dates for which data exists
- Make some date logic more efficient

### Added
- Inline doctests for date functionality

## [0.4.4] 2024-03-02
### Added
- Added logging of dlt job to sentry.io. Need to specify env variable
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ disallow_incomplete_defs = true

[tool.poetry]
name = "ternyxmimosa"
version = "0.4.4"
version = "0.4.5"
description = "A minimal modern data stack with working data pipelines in a single Docker container."
authors = ["Erik Oosterop <[email protected]>"]
license = "MIT"
Expand Down
65 changes: 52 additions & 13 deletions src/mimosa/dateswithoutdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,13 @@ def get_existing_dates_as_integer(start_dt=date(2018, 1, 1), end_dt=None):
# connect to MotherDuck; the connection string is slightly different than for dlt, hence the replace
conn_str = os.environ["DESTINATION__MOTHERDUCK__CREDENTIALS"].replace("/", "")
con = duckdb.connect(conn_str)
query = "select distinct gas_day_start from landing.storage where gas_day_start >= ? and gas_day_start <= ? order by 1 asc"
query = "select distinct gas_day_start from landing.storage where gas_day_start >= $start_date and gas_day_start <= $end_date order by 1 asc"
result = con.execute(
query, [start_dt.strftime("%Y-%m-%d"), end_dt.strftime("%Y-%m-%d")]
query,
{
"start_date": start_dt.strftime("%Y-%m-%d"),
"end_date": end_dt.strftime("%Y-%m-%d"),
},
).fetchall()

# convert results to list of integers. Only consider first column.
Expand All @@ -74,23 +78,34 @@ def get_existing_dates_as_integer(start_dt=date(2018, 1, 1), end_dt=None):
for date in result
]

if len(dates_list) == 0:
dates_list.insert(0, date_to_integer(start_dt) - 1)
dates_list.append(date_to_integer(end_dt) + 1)
if not dates_list:
dates_list.extend([date_to_integer(start_dt) - 1, date_to_integer(end_dt) + 1])
else:
min_date = date_to_integer(start_dt)
min_date, max_date = date_to_integer(start_dt), date_to_integer(end_dt)
if dates_list[0] > min_date:
dates_list.insert(0, min_date - 1)

max_date = date_to_integer(end_dt)
if dates_list[-1] < max_date:
dates_list.append(max_date + 1)

return dates_list


def get_missing_dates(existing_dates_as_integer):
"""Takes a list of existing dates as input and returns a list of missing dates."""
"""Takes a list of existing dates as input and returns a list of missing dates.
>>> print(get_missing_dates([1, 3]))
[datetime.date(1900, 1, 3)]
>>> print(get_missing_dates([1, 3, 4, 7]))
[datetime.date(1900, 1, 3), datetime.date(1900, 1, 6), datetime.date(1900, 1, 7)]
>>> print(get_missing_dates([1, 2, 3]))
[]
>>> print(get_missing_dates([]))
[]
"""
return [
integer_to_date(row) for row in tern.missing_elements(existing_dates_as_integer)
]
Expand All @@ -108,12 +123,23 @@ def get_sequence_ranges(sequence):
Returns:
List[Tuple[int, int]]: The list of tuples containing the first and last value of each sequence.
>>> print(get_sequence_ranges([]))
[]
>>> print(get_sequence_ranges([2]))
[(2, 3)]
>>> print(get_sequence_ranges([1, 3, 5, 7, 9, 11, 13, 15, 17, 19]))
[(1, 2), (3, 4), (5, 6), (7, 8), (9, 10), (11, 12), (13, 14), (15, 16), (17, 18), (19, 20)]
>>> print(get_sequence_ranges([1, 2, 3, 5, 7, 8, 9, 11]))
[(1, 4), (5, 6), (7, 10), (11, 12)]
"""
if len(sequence) < 1:
ranges = []
else:
ranges = []
if len(sequence) > 0:
offset = 1
ranges = []
start = sequence[0]
for i in sequence:
prior = sequence[sequence.index(i) - 1]
Expand All @@ -134,6 +160,13 @@ def convert_integer_tuples_to_date_tuples(integer_tuples):
Returns:
List[Tuple[object, object]]: The list of data tuples containing the converted values.
>>> print(convert_integer_tuples_to_date_tuples([(1, 3), (4, 7)]))
[(datetime.date(1900, 1, 2), datetime.date(1900, 1, 4)), (datetime.date(1900, 1, 5), datetime.date(1900, 1, 8))]
>>> print(convert_integer_tuples_to_date_tuples([]))
[]
"""
return [(integer_to_date(x), integer_to_date(y)) for x, y in integer_tuples]

Expand All @@ -154,3 +187,9 @@ def tuples_of_missing_dates(start_dt=date(2018, 1, 1), end_dt=None):
)
]
return convert_integer_tuples_to_date_tuples(get_sequence_ranges(missing_integers))


if __name__ == "__main__":
import doctest

doctest.testmod()
11 changes: 11 additions & 0 deletions src/mimosa/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,18 @@ def missing_elements(li):
Given a list li, this function finds the missing elements in the range
from the first to the last element of the list and returns
them as a sorted list.
>>> print(missing_elements([1, 2, 3, 5, 7, 8, 9, 11]))
[4, 6, 10]
>>> print(missing_elements([1]))
[]
>>> print(missing_elements([]))
[]
"""
if not li:
li = [1]
start, end = li[0], li[-1]
return sorted(set(range(start, end + 1)).difference(li))

Expand Down
4 changes: 3 additions & 1 deletion src/mimosa/wip_missing_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
pipeline = GEI(destination=destination)
reporting_update = False

for seq in dwd.tuples_of_missing_dates(start_dt=date(2022, 1, 15), end_dt=None):
for seq in dwd.tuples_of_missing_dates(
start_dt=date(2018, 1, 1), end_dt=date(2019, 1, 2)
):
pipeline.run_landing_pipeline(
gas_date=seq[0],
to_gas_date=seq[1],
Expand Down

0 comments on commit f0a60f9

Please sign in to comment.