Skip to content

Commit

Permalink
Fix issues with SF ContentDocument restrictions (#13)
Browse files Browse the repository at this point in the history
* Fix issues with SF ContentDocument restrictions

Workaround for Salesforce (SF) now rejecting, as malformed, ContentDocumentLink queries that previously worked. The errors that were occurring:

> Implementation restriction: ContentDocumentLink requires a filter by a single Id on ContentDocumentId or LinkedEntityId using the equals operator or multiple Id's using the IN operator.

> Implementation restriction: filtering on non-id fields is only permitted when filtering by ContentDocumentLink.LinkedEntityId using the equals operator.

* Update tests for salesforce.py
  • Loading branch information
piotrekkr authored Sep 11, 2024
1 parent 1c07e48 commit aeb6185
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 35 deletions.
32 changes: 24 additions & 8 deletions src/salesforce_archivist/salesforce/salesforce.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,25 +47,37 @@ def _init_tmp_dir(self) -> str:
return tmp_dir

def _get_content_document_list_query(self) -> str:
select_list = ["LinkedEntityId", "ContentDocumentId"]
select_list = ["LinkedEntityId", "ContentDocumentId", "LinkedEntity.Type"]
if self._archivist_obj.dir_name_field is not None and self._archivist_obj.dir_name_field not in select_list:
select_list.append(self._archivist_obj.dir_name_field)
where_list = ["LinkedEntity.Type = '{obj_type}'".format(obj_type=self._archivist_obj.obj_type)]
where_conditions = []
if self._archivist_obj.modified_date_lt is not None:
where_list.append(
where_conditions.append(
"ContentDocument.ContentModifiedDate < {date}".format(
date=self._archivist_obj.modified_date_lt.strftime("%Y-%m-%dT%H:%M:%SZ")
)
)
if self._archivist_obj.modified_date_gt is not None:
where_list.append(
where_conditions.append(
"ContentDocument.ContentModifiedDate > {date}".format(
date=self._archivist_obj.modified_date_gt.strftime("%Y-%m-%dT%H:%M:%SZ")
)
)
return "SELECT {fields} FROM ContentDocumentLink WHERE {where}".format(
fields=", ".join(select_list), where=" AND ".join(where_list)
)
where = ""
if len(where_conditions):
where = "WHERE {}".format(" AND ".join(where_conditions))
# The query uses `WHERE ContentDocumentId IN (...)` and does not filter on `LinkedEntity.Type` because of SF restrictions such as:
#
# Implementation restriction: ContentDocumentLink requires a filter by a single Id on ContentDocumentId
# or LinkedEntityId using the equals operator or multiple Id's using the IN operator.
#
# Implementation restriction: filtering on non-id fields is only permitted when filtering
# by ContentDocumentLink.LinkedEntityId using the equals operator.

return (
"SELECT {fields} FROM ContentDocumentLink "
"WHERE ContentDocumentId IN (SELECT Id FROM ContentDocument {where})"
).format(fields=", ".join(select_list), where=where)

def download_content_document_link_list(
self,
Expand All @@ -81,10 +93,14 @@ def download_content_document_link_list(
reader = csv.reader(file)
next(reader)
for row in reader:
# Skip rows whose linked entity type (third column, `LinkedEntity.Type`) differs from the configured object type.
# This is a workaround for the SF restriction that prevents filtering ContentDocumentLink by type directly in the query.
if row[2] != self._archivist_obj.obj_type:
continue
link = ContentDocumentLink(
linked_entity_id=row[0],
content_document_id=row[1],
download_dir_name=row[2] if self._archivist_obj.dir_name_field is not None else None,
download_dir_name=row[3] if self._archivist_obj.dir_name_field is not None else None,
)
document_link_list.add_link(link)

Expand Down
53 changes: 26 additions & 27 deletions test/salesforce/test_salesforce.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
None,
None,
None,
"SELECT LinkedEntityId, ContentDocumentId FROM ContentDocumentLink WHERE LinkedEntity.Type = 'User'",
"SELECT LinkedEntityId, ContentDocumentId, LinkedEntity.Type FROM ContentDocumentLink WHERE ContentDocumentId IN (SELECT Id FROM ContentDocument )",
),
(
datetime(
Expand All @@ -38,10 +38,9 @@
None,
None,
(
"SELECT LinkedEntityId, ContentDocumentId "
"SELECT LinkedEntityId, ContentDocumentId, LinkedEntity.Type "
"FROM ContentDocumentLink "
"WHERE LinkedEntity.Type = 'User' "
"AND ContentDocument.ContentModifiedDate < 2024-01-01T00:00:00Z"
"WHERE ContentDocumentId IN (SELECT Id FROM ContentDocument WHERE ContentDocument.ContentModifiedDate < 2024-01-01T00:00:00Z)"
),
),
(
Expand All @@ -67,11 +66,11 @@
),
None,
(
"SELECT LinkedEntityId, ContentDocumentId "
"SELECT LinkedEntityId, ContentDocumentId, LinkedEntity.Type "
"FROM ContentDocumentLink "
"WHERE LinkedEntity.Type = 'User' "
"AND ContentDocument.ContentModifiedDate < 2024-01-01T00:00:00Z "
"AND ContentDocument.ContentModifiedDate > 2023-01-01T00:00:00Z"
"WHERE ContentDocumentId IN ("
"SELECT Id FROM ContentDocument WHERE ContentDocument.ContentModifiedDate < 2024-01-01T00:00:00Z AND ContentDocument.ContentModifiedDate > 2023-01-01T00:00:00Z"
")"
),
),
(
Expand All @@ -97,11 +96,11 @@
),
"DirField",
(
"SELECT LinkedEntityId, ContentDocumentId, DirField "
"SELECT LinkedEntityId, ContentDocumentId, LinkedEntity.Type, DirField "
"FROM ContentDocumentLink "
"WHERE LinkedEntity.Type = 'User' "
"AND ContentDocument.ContentModifiedDate < 2024-01-01T00:00:00Z "
"AND ContentDocument.ContentModifiedDate > 2023-01-01T00:00:00Z"
"WHERE ContentDocumentId IN ("
"SELECT Id FROM ContentDocument WHERE ContentDocument.ContentModifiedDate < 2024-01-01T00:00:00Z AND ContentDocument.ContentModifiedDate > 2023-01-01T00:00:00Z"
")"
),
),
],
Expand Down Expand Up @@ -141,35 +140,35 @@ def test_download_content_document_link_list_queries(
[],
# no results from query (file with only header)
[
[["LinkedEntityId", "ContentDocumentId"]],
[["LinkedEntityId", "ContentDocumentId", "Type"]],
],
# results without custom field for dir name
[
[
["LinkedEntityId", "ContentDocumentId"],
["LinkedEntityId_1", "ContentDocumentId_1"],
["LinkedEntityId_2", "ContentDocumentId_2"],
["LinkedEntityId", "ContentDocumentId", "User"],
["LinkedEntityId_1", "ContentDocumentId_1", "User"],
["LinkedEntityId_2", "ContentDocumentId_2", "User"],
]
],
# results with custom field for dir name
[
[
["LinkedEntityId", "ContentDocumentId", "CustomFieldForDirName"],
["LinkedEntityId_1", "ContentDocumentId_1", "CustomFieldForDirName_1"],
["LinkedEntityId_2", "ContentDocumentId_2", "CustomFieldForDirName_2"],
["LinkedEntityId", "ContentDocumentId", "User", "CustomFieldForDirName"],
["LinkedEntityId_1", "ContentDocumentId_1", "User", "CustomFieldForDirName_1"],
["LinkedEntityId_2", "ContentDocumentId_2", "User", "CustomFieldForDirName_2"],
]
],
# results with custom field for dir name in multiple csv files
[
[
["LinkedEntityId", "ContentDocumentId", "CustomFieldForDirName"],
["LinkedEntityId_1", "ContentDocumentId_1", "CustomFieldForDirName_1"],
["LinkedEntityId_2", "ContentDocumentId_2", "CustomFieldForDirName_2"],
["LinkedEntityId", "ContentDocumentId", "User", "CustomFieldForDirName"],
["LinkedEntityId_1", "ContentDocumentId_1", "User", "CustomFieldForDirName_1"],
["LinkedEntityId_2", "ContentDocumentId_2", "User", "CustomFieldForDirName_2"],
],
[
["LinkedEntityId", "ContentDocumentId", "CustomFieldForDirName"],
["LinkedEntityId_3", "ContentDocumentId_3", "CustomFieldForDirName_3"],
["LinkedEntityId_4", "ContentDocumentId_4", "CustomFieldForDirName_4"],
["LinkedEntityId", "ContentDocumentId", "User", "CustomFieldForDirName"],
["LinkedEntityId_3", "ContentDocumentId_3", "User", "CustomFieldForDirName_3"],
["LinkedEntityId_4", "ContentDocumentId_4", "User", "CustomFieldForDirName_4"],
],
],
],
Expand All @@ -182,7 +181,7 @@ def test_download_content_document_link_list_csv_reading(
archivist_obj = ArchivistObject(
data_dir=tmp_dir,
obj_type="User",
dir_name_field=(csv_files_data[0][0][2] if len(csv_files_data) and len(csv_files_data[0][0]) > 2 else None),
dir_name_field=(csv_files_data[0][0][3] if len(csv_files_data) and len(csv_files_data[0][0]) > 3 else None),
)
client.bulk2 = Mock(
side_effect=lambda *args, **kwargs: gen_temp_csv_files(
Expand All @@ -196,7 +195,7 @@ def test_download_content_document_link_list_csv_reading(
doc_link = ContentDocumentLink(
linked_entity_id=row[0],
content_document_id=row[1],
download_dir_name=row[2] if len(row) > 2 else row[0],
download_dir_name=row[3] if len(row) > 3 else row[0],
)
add_link_calls.append(call(doc_link))

Expand Down

0 comments on commit aeb6185

Please sign in to comment.