Skip to content

Commit

Permalink
Merge pull request #320 from GSA/iis-date-parser
Browse files: browse the repository at this point in the history
Add an additional date format parser for IIS servers
  • Loading branch information
amercader authored Nov 10, 2023
2 parents fe13b30 + 0beefa2 commit 7350dbd
Showing 1 changed file with 22 additions and 11 deletions.
33 changes: 22 additions & 11 deletions ckanext/spatial/harvesters/waf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -244,17 +244,26 @@ def fetch_stage(self, harvest_object):
,adjacent=False, joinString=' ').setResultsName('date')
)

iis = parse.SkipTo("<br>").suppress() \
+ parse.OneOrMore("<br>").suppress() \
+ parse.Optional(parse.Combine(
parse.Word(parse.alphanums+'/') +
parse.Word(parse.alphanums+':') +
parse.Word(parse.alphas)
, adjacent=False, joinString=' ').setResultsName('date')
) \
+ parse.Word(parse.nums).suppress() \
+ parse.Literal('<A HREF=').suppress() \
+ parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url')
iis = parse.SkipTo("<br>").suppress() \
+ parse.OneOrMore("<br>").suppress() \
+ parse.Optional(parse.Combine(
parse.Word(parse.alphanums+'/') +
parse.Word(parse.alphanums+':') +
parse.Word(parse.alphas)
, adjacent=False, joinString=' ').setResultsName('date')
) \
+ parse.Optional(parse.Combine(
parse.Word(parse.alphas+',') +
parse.Word(parse.alphas) +
parse.Word(parse.nums+',') +
parse.Word(parse.nums) +
parse.Word(parse.nums+':') +
parse.Word(parse.alphas)
, adjacent=False, joinString=' ').setResultsName('date')
) \
+ parse.Word(parse.nums).suppress() \
+ parse.Literal('<A HREF=').suppress() \
+ parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url')

other = parse.SkipTo(parse.CaselessLiteral("<a href="), include=True).suppress() \
+ parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url')
Expand Down Expand Up @@ -328,6 +337,8 @@ def _extract_waf(content, base_url, scraper, results = None, depth=0):
except Exception as e:
raise
date = None
if not date:
log.debug('failed to get date for %s', url)
results.append((urljoin(base_url, record.url), date))

return results

0 comments on commit 7350dbd

Please sign in to comment.