DDG: Add debug prints in the parser.

This commit is contained in:
Valentin Lorentz 2020-07-01 10:34:13 +02:00
parent 59f1441d23
commit 427cf82d6b

View File

@ -32,6 +32,12 @@ import enum
import collections
from html.parser import HTMLParser
DEBUG = False
def debug(msg, *args):
if DEBUG:
print(msg % args)
result = collections.namedtuple('result', 'link title snippet')
@enum.unique
@ -62,6 +68,7 @@ class DDGHTMLParser(HTMLParser):
classes = attrs.get('class', '').split()
if tag in STACKED_TAGS:
debug('Stacking %s with classes %s', tag, classes)
self.stack.append((tag, classes))
if ('tr', ['result-sponsored']) in self.stack:
@ -69,17 +76,20 @@ class DDGHTMLParser(HTMLParser):
return
if tag == 'a' and 'result-link' in classes:
debug('Got result-link')
assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)
self.state = ParserState.IN_TITLE
self.current_link = attrs['href']
self.current_title = []
elif tag == 'td' and 'result-snippet' in classes:
debug('Got result-snipper')
assert self.state == ParserState.TITLE_PARSED, self.state
self.state = ParserState.IN_SNIPPET
self.current_snippet = []
elif tag == 'span' and 'link-text' in classes:
debug('Got link-text')
if self.state == ParserState.TITLE_PARSED:
# No snippet
self.state = ParserState.OUTSIDE
@ -91,23 +101,29 @@ class DDGHTMLParser(HTMLParser):
assert item[0] == tag, (item, tag)
if tag == 'a' and self.state == ParserState.IN_TITLE:
debug('Title parsed')
self.state = ParserState.TITLE_PARSED
elif tag == 'td' and self.state == ParserState.IN_SNIPPET:
debug('Snippet parsed')
self.build_result()
self.state = ParserState.OUTSIDE
def handle_data(self, data):
if self.state == ParserState.IN_TITLE:
debug('Got title data: %s', data)
self.current_title.append(data)
elif self.state == ParserState.IN_SNIPPET:
debug('Got snippet data: %s', data)
self.current_snippet.append(data)
def build_result(self):
self.results.append(result(
res = result(
link=self.current_link,
title=''.join(self.current_title),
snippet=''.join(self.current_snippet),
))
)
debug('Finished parsing result: %r', res)
self.results.append(res)
self.reset_current_result()
if __name__ == '__main__':