
entry_dedupe: fix flip-flopping between duplicate new entries. #340 #343

Merged · 2 commits · Jun 23, 2024
7 changes: 6 additions & 1 deletion CHANGES.rst
@@ -15,6 +15,11 @@ Unreleased
  :class:`EntryCounts` and :class:`EntrySearchCounts`.
  Thanks to `chenthur`_ for the pull request.
  (:issue:`283`)
+* Fix bug in :mod:`~reader.plugins.entry_dedupe` causing entries to flip-flop
+  if there were multiple *new* duplicates of the same entry
+  (on the first update, one entry remains, on the second update, the other);
+  related to the bug fixed in `version 3.2 <Version 3.2_>`_.
+  (:issue:`340`)

.. _chenthur: https://github.com/chenthur

@@ -354,7 +359,7 @@ Released 2022-09-14
  became optional.
  (:issue:`96`)
* Fix bug in :mod:`~reader.plugins.entry_dedupe` causing updates to fail
-  if there were multiple *new* duplicates of the same issue.
+  if there were multiple *new* duplicates of the same entry.
  (:issue:`292`)
* Fix bug in :mod:`~reader.plugins.readtime`
  and :mod:`~reader.plugins.mark_as_read` causing updates to fail
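To make the changelog entry above concrete: the flip-flop happens because the old tie-break depended on `last_updated`, which changes every time a deleted duplicate comes back as "new". A minimal standalone sketch of that dynamic (a simplified model of the hook's behavior, not reader's actual update machinery; `Entry` and `dedupe` here are hypothetical stand-ins):

```python
# Simplified model: the dedupe hook only runs for *new* entries, and the
# old tie-break kept the entry with the highest last_updated.
from dataclasses import dataclass


@dataclass
class Entry:
    id: str
    last_updated: int  # stand-in for the real datetime


def dedupe(new, existing):
    # keep the entry with the highest last_updated; the rest get deleted
    group = sorted([new, *existing], key=lambda e: e.last_updated, reverse=True)
    return group[0]


# update 1: '2' and '3' are both new duplicates; '3' is stored last,
# so its hook sees '2' as existing, and '3' survives
assert dedupe(Entry('3', 2), [Entry('2', 1)]).id == '3'

# update 2: '2' reappears as new (it was deleted), gets a fresh
# last_updated, and its hook deletes '3' -- the survivor flips; repeat
assert dedupe(Entry('2', 3), [Entry('3', 2)]).id == '2'
```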
42 changes: 35 additions & 7 deletions src/reader/plugins/entry_dedupe.py
@@ -113,14 +113,15 @@
from collections import Counter
from collections import defaultdict
from collections import deque
+from datetime import datetime
+from datetime import timezone
from itertools import groupby
from typing import NamedTuple

from reader._utils import BetterStrPartial as partial
from reader.exceptions import EntryNotFoundError
from reader.exceptions import TagNotFoundError
from reader.types import EntryUpdateStatus
-from reader.types import Feed


log = logging.getLogger('reader.plugins.entry_dedupe')
@@ -260,11 +261,41 @@ def _after_entry_update(reader, entry, status, *, dry_run=False):
        if _is_duplicate_full(entry, e)
    ]
    if not duplicates:
+        log.debug("entry_dedupe: no duplicates for %s", entry.resource_id)
        return

+    try:
+        entry = reader.get_entry(entry)
+    except EntryNotFoundError:
+        log.info("entry_dedupe: entry %r was deleted, aborting", entry.resource_id)
+        return
+
+    def group_key(e):
+        # unlike _get_entry_groups, we cannot rely on e.last_updated,
+        # because for duplicates in the feed, we'd end up flip-flopping
+        # (on the first update, entry 1 is deleted and entry 2 remains;
+        # on the second update, entry 1 remains because it's new,
+        # and entry 2 is deleted because it's not modified,
+        # has lower last_updated, and no update hook runs for it; repeat).
+        #
+        # it would be more correct to sort by (is in new feed, last_retrieved),
+        # but as of 3.14, we don't know about existing but not modified entries
+        # (the hook isn't called), and entries don't have last_retrieved.
+        #
+        # also see test_duplicates_in_feed / #340.
+        #
+        return e.updated or e.published or DEFAULT_UPDATED, e.id
+
+    group = [entry] + duplicates
+    group.sort(key=group_key, reverse=True)
+    entry, *duplicates = group
+
    _dedupe_entries(reader, entry, duplicates, dry_run=dry_run)


+DEFAULT_UPDATED = datetime(1970, 1, 1, tzinfo=timezone.utc)


def _get_same_group_entries(reader, entry):
    # to make this better, we could do something like
    # reader.search_entries(f'title: {fts5_escape(entry.title)}'),
@@ -347,6 +378,7 @@ def by_title(e):
            continue

        while group:
+            # keep the latest entry, consider the rest duplicates
            group.sort(key=lambda e: e.last_updated, reverse=True)
            entry, *rest = group
@@ -496,10 +528,6 @@ def _dedupe_entries(reader, entry, duplicates, *, dry_run):
        [e.id for e in duplicates],
    )

-    # in case entry is EntryData, not Entry
-    if hasattr(entry, 'as_entry'):
-        entry = entry.as_entry(feed=Feed(entry.feed_url))

    # don't do anything until we know all actions were generated successfully
    actions = list(_make_actions(reader, entry, duplicates))
    # FIXME: what if this fails with EntryNotFoundError?
@@ -510,8 +538,8 @@ def _dedupe_entries(reader, entry, duplicates, *, dry_run):
        for action in actions:
            action()
            log.info("entry_dedupe: %s", action)
-    except EntryNotFoundError as e:
-        if entry.resource_id != e.resource_id:  # pragma: no cover
+    except EntryNotFoundError as e:  # pragma: no cover
+        if entry.resource_id != e.resource_id:
            raise
        log.info("entry_dedupe: entry %r was deleted, aborting", entry.resource_id)

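In short, the fix replaces the `last_updated` tie-break with a key derived from entry content, which is stable across updates. A standalone sketch of how the new `group_key` behaves (the key itself is copied from the diff above; `SimpleNamespace` is a stand-in for real entries):

```python
from datetime import datetime, timezone
from types import SimpleNamespace

DEFAULT_UPDATED = datetime(1970, 1, 1, tzinfo=timezone.utc)


def group_key(e):
    # content dates (with a constant fallback) plus the id don't change
    # between updates, so the same duplicate wins every time
    return e.updated or e.published or DEFAULT_UPDATED, e.id


entries = [
    SimpleNamespace(id='1', updated=None, published=None),
    SimpleNamespace(id='2', updated=datetime(2010, 1, 2, tzinfo=timezone.utc), published=None),
    SimpleNamespace(id='3', updated=None, published=datetime(2010, 1, 1, tzinfo=timezone.utc)),
]
entries.sort(key=group_key, reverse=True)
# '2' has the latest date, so it survives; the dateless entry sorts last
assert [e.id for e in entries] == ['2', '3', '1']
```

Because the key is deterministic, running the update twice yields the same survivor, which is exactly what the updated test below asserts.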
33 changes: 28 additions & 5 deletions tests/test_plugins_entry_dedupe.py
@@ -755,12 +755,13 @@ def test_recent_sort_copying(make_reader, db_path):


@pytest.mark.parametrize('update_after_one', [False, True])
-def test_duplicates_in_new_feed(make_reader, update_after_one):
+@pytest.mark.parametrize('with_dates, expected_id', [(False, '3'), (True, '2')])
+def test_duplicates_in_feed(make_reader, update_after_one, with_dates, expected_id):
    reader = make_reader(':memory:', plugins=['reader.entry_dedupe'])
    reader._parser = parser = Parser()

    reader.add_feed(parser.feed(1))
-    one = parser.entry(1, 1, title='title', summary='summary')
+    one = parser.entry(1, '1', title='title', summary='summary')

    if update_after_one:
        reader.update_feeds()
@@ -769,10 +770,32 @@ def test_duplicates_in_new_feed(make_reader, update_after_one):
    reader.mark_entry_as_important(one)
    reader.set_tag(one, 'key', 'value')

-    parser.entry(1, 2, title='title', summary='summary')
-    parser.entry(1, 3, title='title', summary='summary')
+    parser.entry(
+        1,
+        '2',
+        title='title',
+        summary='summary',
+        updated=datetime(2010, 1, 2) if with_dates else None,
+    )
+    parser.entry(
+        1,
+        '3',
+        title='title',
+        summary='summary',
+        published=datetime(2010, 1, 1) if with_dates else None,
+    )

    # shouldn't fail
    reader.update_feeds()
+    assert {e.id for e in reader.get_entries()} == {expected_id}

-    assert [eval(e.id)[1] for e in reader.get_entries()] == [3]
+    # shouldn't flip flop
+    # https://github.com/lemon24/reader/issues/340
+    reader.update_feeds()
+    assert {e.id for e in reader.get_entries()} == {expected_id}

+    if update_after_one:
+        entry = reader.get_entry(('1', expected_id))
+        assert entry.read
+        assert entry.important
+        assert dict(reader.get_tags(entry)) == {'key': 'value'}
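For the `with_dates=False` case, all three duplicates fall back to `DEFAULT_UPDATED`, so the id alone breaks the tie; a quick check using the same stand-ins as the sketch above:

```python
from datetime import datetime, timezone
from types import SimpleNamespace

DEFAULT_UPDATED = datetime(1970, 1, 1, tzinfo=timezone.utc)


def group_key(e):
    return e.updated or e.published or DEFAULT_UPDATED, e.id


entries = [SimpleNamespace(id=i, updated=None, published=None) for i in '123']
entries.sort(key=group_key, reverse=True)
# all dates tie at DEFAULT_UPDATED, so the highest id wins: expected_id == '3'
assert entries[0].id == '3'
```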