@@ -179,6 +179,7 @@ def _copyright(soup):
179
179
for element in soup .findAll ('a' , text = True ):
180
180
# LOG.debug("TEXT: "+element.text)
181
181
182
+ # TODO: this breaks for APODs like 2024-11-09, where it credits "Voyager" instead of "Voyager 2"
182
183
if use_next :
183
184
copyright_text = element .text .strip (' ' )
184
185
break
@@ -188,23 +189,69 @@ def _copyright(soup):
188
189
use_next = True
189
190
190
191
if not copyright_text :
192
+ LOG .debug ("didn't find copyright using first method!" )
191
193
192
194
for element in soup .findAll (['b' , 'a' ], text = True ):
193
195
# search text for explicit match
194
- if 'Copyright' in element .text :
195
- LOG .debug ('Found Copyright text:' + str (element .text ))
196
+ if 'Copyright' in element .text or 'Image Credit' in element . text :
197
+ LOG .debug ('Found Potential Copyright text:' + str (element .text ))
196
198
# pull the copyright from the link text which follows
197
199
sibling = element .next_sibling
198
200
stuff = ""
201
+
202
+ # these are used for checking when to add attribution
203
+ # if an image contains no direct copyright/license mentions AND credits NASA,
204
+ # we can assume the image is public domain and drop the attribution - otherwise we keep it
205
+ found_license_mention = False
206
+ found_nasa_credit = False
207
+
199
208
while sibling :
200
209
try :
201
- stuff = stuff + sibling .text
202
- except Exception :
210
+ # clean up the text a bit and get rid of double spaces
211
+ sibling_text = sibling .text .replace ('\n ' , ' ' ).replace (' ' , ' ' )
212
+
213
+ # LOG.debug("!!! adding1: |" + sibling_text + "|")
214
+ stuff = stuff + sibling_text
215
+
216
+ if sibling_text .lower ().strip (' ' ) == "nasa" :
217
+ found_nasa_credit = True
218
+ LOG .debug (">> found NASA credit!" )
219
+
220
+ # handle edge cases for licenses and copyright. might not work for all cases yet
221
+ if "license" in sibling_text .lower () or "copyright" in sibling_text .lower ():
222
+ LOG .debug (">> found license mention!" )
223
+ found_license_mention = True
224
+ for link in sibling .findAll ('a' , text = True ):
225
+ LOG .debug ("LINK:" + str (link ))
226
+
227
+ if "license" in link .text .lower () or "copyright" in link .text .lower ():
228
+ LOG .debug ("License link: |" + str (link ) + "| from |" + str (sibling_text ) + "|" )
229
+ LOG .debug ("stuff before: |" + stuff + "|" )
230
+
231
+ # adding license link - clean up the URL and text, just in case
232
+ clean_link = link ["href" ].strip ('\n ' ).strip (' ' )
233
+ license = clean_link + " " + link .text .strip ('\n ' ).strip (' ' )
234
+ LOG .debug ("license info:" + license )
235
+ # make license prettier if we can by checking for the type of license
236
+ # TODO: recognise more license types here, maybe?
237
+ if "creativecommons.org/licenses/by/2.0" in license :
238
+ license = "CC-BY-2.0"
239
+
240
+ LOG .debug ("!!! adding: |" + license + "|" )
241
+ stuff = stuff + " " + license
242
+
243
+ except Exception as ex :
244
+ LOG .warning ("exception in copyright handler (sibling): " + str (ex ))
203
245
pass
204
246
sibling = sibling .next_sibling
205
247
206
248
if stuff :
207
- copyright_text = stuff .strip (' ' )
249
+ if not found_license_mention and found_nasa_credit :
250
+ LOG .debug ("image is likely public domain - explicit NASA credit and no license/copyright mentions found" )
251
+ copyright_text = None
252
+ else :
253
+ # LOG.debug("found license or copyright")
254
+ copyright_text = stuff .strip (' ' ).replace (' ' , ' ' )
208
255
try :
209
256
copyright_text = copyright_text .encode ('latin1' ).decode ('cp1252' )
210
257
except Exception as ex :
0 commit comments