Skip to content

Commit b77b995

Browse files
committed
feat: more robust copyright parsing.
intends to fix nasa#126 Signed-off-by: rooot <[email protected]>
1 parent e7bb0a4 commit b77b995

File tree

1 file changed

+52
-5
lines changed

1 file changed

+52
-5
lines changed

apod/utility.py

+52-5
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ def _copyright(soup):
179179
for element in soup.findAll('a', text=True):
180180
# LOG.debug("TEXT: "+element.text)
181181

182+
# TODO: this breaks for APODs like 2024-11-09, where it credits "Voyager" instead of "Voyager 2"
182183
if use_next:
183184
copyright_text = element.text.strip(' ')
184185
break
@@ -188,23 +189,69 @@ def _copyright(soup):
188189
use_next = True
189190

190191
if not copyright_text:
192+
LOG.debug("didn't find copyright using first method!")
191193

192194
for element in soup.findAll(['b', 'a'], text=True):
193195
# search text for explicit match
194-
if 'Copyright' in element.text:
195-
LOG.debug('Found Copyright text:' + str(element.text))
196+
if 'Copyright' in element.text or 'Image Credit' in element.text:
197+
LOG.debug('Found Potential Copyright text:' + str(element.text))
196198
# pull the copyright from the link text which follows
197199
sibling = element.next_sibling
198200
stuff = ""
201+
202+
# these are used for checking when to add attribution
203+
# if an image contains no direct copyright/license mentions AND credits NASA,
204+
# we can assume the image is public domain - if not, we add the attribution
205+
found_license_mention = False
206+
found_nasa_credit = False
207+
199208
while sibling:
200209
try:
201-
stuff = stuff + sibling.text
202-
except Exception:
210+
# clean up the text a bit and get rid of double spaces
211+
sibling_text = sibling.text.replace('\n', ' ').replace(' ', ' ')
212+
213+
# LOG.debug("!!! adding1: |" + sibling_text + "|")
214+
stuff = stuff + sibling_text
215+
216+
if sibling_text.lower().strip(' ') == "nasa":
217+
found_nasa_credit = True
218+
LOG.debug(">> found NASA credit!")
219+
220+
# handle edge cases for licenses and copyright. might not work for all cases yet
221+
if "license" in sibling_text.lower() or "copyright" in sibling_text.lower():
222+
LOG.debug(">> found license mention!")
223+
found_license_mention = True
224+
for link in sibling.findAll('a', text=True):
225+
LOG.debug("LINK:" + str(link))
226+
227+
if "license" in link.text.lower() or "copyright" in link.text.lower():
228+
LOG.debug("License link: |" + str(link) + "| from |" + str(sibling_text) + "|")
229+
LOG.debug("stuff before: |" + stuff + "|")
230+
231+
# adding license link - clean up the URL and text, just in case
232+
clean_link = link["href"].strip('\n').strip(' ')
233+
license = clean_link + " " + link.text.strip('\n').strip(' ')
234+
LOG.debug("license info:" + license)
235+
# make license prettier if we can by checking for the type of license
236+
# todo: add more licenses, maybe?
237+
if "creativecommons.org/licenses/by/2.0" in license:
238+
license = "CC-BY-2.0"
239+
240+
LOG.debug("!!! adding: |" + license + "|")
241+
stuff = stuff + " " + license
242+
243+
except Exception as ex:
244+
LOG.warning("exception in copyright handler (sibling): " + str(ex))
203245
pass
204246
sibling = sibling.next_sibling
205247

206248
if stuff:
207-
copyright_text = stuff.strip(' ')
249+
if not found_license_mention and found_nasa_credit:
250+
LOG.debug("image is likely public domain - explicit NASA credit and no license/copyright mentions found")
251+
copyright_text = None
252+
else:
253+
# LOG.debug("found license or copyright")
254+
copyright_text = stuff.strip(' ').replace(' ', ' ')
208255
try:
209256
copyright_text = copyright_text.encode('latin1').decode('cp1252')
210257
except Exception as ex:

0 commit comments

Comments
 (0)