2020-05-15 21:36:31 +02:00
|
|
|
###
|
|
|
|
# Copyright (c) 2020, Valentin Lorentz
|
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
#
|
|
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer.
|
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer in the
|
|
|
|
# documentation and/or other materials provided with the distribution.
|
|
|
|
# * Neither the name of the author of this software nor the name of
|
|
|
|
# contributors to this software may be used to endorse or promote products
|
|
|
|
# derived from this software without specific prior written consent.
|
|
|
|
#
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
|
|
|
###
|
|
|
|
|
2020-05-15 21:26:44 +02:00
|
|
|
import enum
|
|
|
|
import collections
|
|
|
|
from html.parser import HTMLParser
|
|
|
|
|
2020-07-01 10:34:13 +02:00
|
|
|
# Set to True to make debug() print parser trace messages to stdout.
DEBUG = False


def debug(msg, *args):
    """Print a trace message to stdout when the module-level DEBUG is true.

    ``msg`` is a printf-style format string and ``args`` are the values
    interpolated into it.  When no ``args`` are given the message is printed
    verbatim, so a literal ``%`` in it (e.g. ``'100% done'``) cannot raise a
    formatting error.

    Does nothing while DEBUG is false.
    """
    if not DEBUG:
        return
    # Interpolate only when there is something to interpolate.
    print(msg % args if args else msg)
|
|
|
|
|
2020-05-15 21:26:44 +02:00
|
|
|
# Record type for one parsed search result; fields, in order, are
# ``link``, ``title`` and ``snippet``.
result = collections.namedtuple(
    typename='result',
    field_names='link title snippet',
)
|
|
|
|
|
|
|
|
@enum.unique
class ParserState(enum.Enum):
    """Position of the parser relative to the search result being read."""

    # Not inside any result; waiting for the next result title link.
    OUTSIDE = 0

    # Inside the title link; text nodes are collected as the title.
    IN_TITLE = 1

    # The title link was closed; a snippet cell may follow.
    TITLE_PARSED = 2

    # Inside the snippet cell; text nodes are collected as the snippet.
    IN_SNIPPET = 3
|
|
|
|
|
2020-07-01 20:05:12 +02:00
|
|
|
|
|
|
|
# This is implemented as a stack automaton. Here is the transition graph.
# See the numbered comments below for the description of each transition.
#
# --> OUTSIDE --(1)--> IN_TITLE --(2)--> TITLE_PARSED --(3)--> IN_SNIPPET
#      ^  ^                                  |                     |
#      |  |                                  |                     |
#      |  +----------------(5)---------------+                     |
#      |                                                           |
#      +------------------------------(4)--------------------------+
|
|
|
|
|
|
|
|
|
2020-05-15 21:26:44 +02:00
|
|
|
# Tags that DDGHTMLParser pushes onto its stack when opened and pops when
# closed.
STACKED_TAGS = (
    'table',
    'tr',
    'td',
    'a',
)
|
|
|
|
|
|
|
|
class DDGHTMLParser(HTMLParser):
    """HTML parser that collects search results into ``self.results``.

    Feed a page with ``feed()``; every result found is appended to
    ``results`` as a ``result(link, title, snippet)`` tuple.  Anything nested
    inside a ``<tr>`` carrying the ``result-sponsored`` class is ignored.

    Parsing is driven by ``self.state`` (a ``ParserState``):

    1. OUTSIDE -> IN_TITLE on ``<a class="result-link">``
    2. IN_TITLE -> TITLE_PARSED on the matching ``</a>``
    3. TITLE_PARSED -> IN_SNIPPET on ``<td class="result-snippet">``
    4. IN_SNIPPET -> OUTSIDE on ``</td>``; the result is recorded
    5. TITLE_PARSED -> OUTSIDE on ``<span class="link-text">``; the result
       is recorded with an empty snippet

    Raises ``AssertionError`` when a title or snippet starts in an unexpected
    state, or when a closing tag does not match the innermost open tag among
    ``STACKED_TAGS``.
    """

    def __init__(self):
        super().__init__()
        # Currently open STACKED_TAGS elements, innermost last, stored as
        # (tag, list_of_classes) pairs.
        self.stack = []
        # Completed ``result`` tuples, in document order.
        self.results = []

        self.reset_current_result()

    def reset_current_result(self):
        """Forget the result in progress and go back to OUTSIDE."""
        self.state = ParserState.OUTSIDE
        self.current_link = None
        self.current_title = None
        self.current_snippet = None

    def handle_starttag(self, tag, attrs):
        """Track nesting and start a title/snippet when its tag opens."""
        attrs = dict(attrs)
        # HTMLParser reports a valueless attribute (``<td class>``) as None,
        # so ``attrs.get('class', '')`` is not enough to guarantee a string.
        classes = (attrs.get('class') or '').split()

        if tag in STACKED_TAGS:
            debug('Stacking %s with classes %s', tag, classes)
            self.stack.append((tag, classes))

        # Skip sponsored results.  Match on class membership so that a row
        # carrying additional classes is still recognised.
        if any(
            stacked_tag == 'tr' and 'result-sponsored' in stacked_classes
            for (stacked_tag, stacked_classes) in self.stack
        ):
            return

        if tag == 'a' and 'result-link' in classes:
            # 1. Starts the title of a result; transition from OUTSIDE
            #    to IN_TITLE
            debug('Got result-link')
            assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)
            self.state = ParserState.IN_TITLE
            self.current_link = attrs['href']
            self.current_title = []

        elif tag == 'td' and 'result-snippet' in classes:
            # 3. Starts a snippet. Normally, just after a title ended.
            #    Transition from TITLE_PARSED to IN_SNIPPET
            debug('Got result-snipper')
            assert self.state == ParserState.TITLE_PARSED, self.state
            self.state = ParserState.IN_SNIPPET
            self.current_snippet = []

        elif tag == 'span' and 'link-text' in classes:
            # 5. This is the link, after a snippet if any. We catch it to
            #    detect results without a snippet. If so, transition directly
            #    from TITLE_PARSED to OUTSIDE
            debug('Got link-text')
            if self.state == ParserState.TITLE_PARSED:
                # No snippet: record the result with an empty snippet rather
                # than dropping it.  build_result() resets the state to
                # OUTSIDE.
                self.current_snippet = []
                self.build_result()

    def handle_endtag(self, tag):
        """Unwind the tag stack and finish the title/snippet being read."""
        if tag in STACKED_TAGS:
            item = self.stack.pop()
            assert item[0] == tag, (item, tag)

        if tag == 'a' and self.state == ParserState.IN_TITLE:
            # 2. End of the <a> node matched in step 1; transition from
            #    IN_TITLE to TITLE_PARSED
            debug('Title parsed')
            self.state = ParserState.TITLE_PARSED
        elif tag == 'td' and self.state == ParserState.IN_SNIPPET:
            # 4. End of the <td> node matched in step 3, this concludes the
            #    parsing of this result. Transition from IN_SNIPPET to OUTSIDE
            debug('Snippet parsed')
            self.build_result()
            self.state = ParserState.OUTSIDE

    def handle_data(self, data):
        """Append text to the title or snippet currently being read."""
        if self.state == ParserState.IN_TITLE:
            debug('Got title data: %s', data)
            self.current_title.append(data)
        elif self.state == ParserState.IN_SNIPPET:
            debug('Got snippet data: %s', data)
            self.current_snippet.append(data)

    def build_result(self):
        """Record the result in progress and reset for the next one.

        The collected title and snippet fragments are joined without a
        separator.
        """
        res = result(
            link=self.current_link,
            title=''.join(self.current_title),
            snippet=''.join(self.current_snippet),
        )
        debug('Finished parsing result: %r', res)
        self.results.append(res)
        self.reset_current_result()
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual check: parse a saved results page and print what was extracted.
    import sys

    # The page to parse may be given as the first command-line argument;
    # without one, the historical default file name is used.
    path = sys.argv[1] if len(sys.argv) > 1 else 'ddg2.html'

    parser = DDGHTMLParser()
    # Decode explicitly instead of relying on the platform default encoding.
    with open(path, encoding='utf-8') as fd:
        parser.feed(fd.read())
    # Make HTMLParser process any data it is still buffering.
    parser.close()
    print(parser.results)
|