Limnoria/plugins/DDG/parser.py

###
# Copyright (c) 2020, Valentin Lorentz
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions, and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions, and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of the author of this software nor the name of
#     contributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

###

import enum
import collections
from html.parser import HTMLParser

DEBUG = False

def debug(msg, *args):
    if DEBUG:
        print(msg % args)

result = collections.namedtuple('result', 'link title snippet')

@enum.unique
class ParserState(enum.Enum):
    OUTSIDE = 0
    IN_TITLE = 1
    TITLE_PARSED = 2
    IN_SNIPPET = 3


# This is implemented as a stack automaton. Here is the transition graph.
# See comments below to find the description of each transition
#
# --> OUTSIDE --(1)--> IN_TITLE --(2)--> TITLE_PARSED --(3)--> IN_SNIPPET
#       ^  ^                                  |                     |
#       |  |                                  |                     |
#       |  +----------------(5)---------------+                     |
#       |                                                           |
#       +------------------------------(4)--------------------------+


STACKED_TAGS = ('table', 'tr', 'td', 'a')

class DDGHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.stack = []
        self.results = []

        self.reset_current_result()

    def reset_current_result(self):
        self.state = ParserState.OUTSIDE
        self.current_link = None
        self.current_title = None
        self.current_snippet = None

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        classes = attrs.get('class', '').split()

        if tag in STACKED_TAGS:
            debug('Stacking %s with classes %s', tag, classes)
            self.stack.append((tag, classes))

        if ('tr', ['result-sponsored']) in self.stack:
            # Skip sponsored results
            return

        if tag == 'a' and 'result-link' in classes:
            # 1. Starts the title of a result; transition from OUTSIDE
            #    to IN_TITLE
            debug('Got result-link')
            assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)
            self.state = ParserState.IN_TITLE
            self.current_link = attrs['href']
            self.current_title = []

        elif tag == 'td' and 'result-snippet' in classes:
            # 3. Starts a snippet. Normally, just after a title ended.
            #    Transition from TITLE_PARSED to IN_SNIPPET
            debug('Got result-snipper')
            assert self.state == ParserState.TITLE_PARSED, self.state
            self.state = ParserState.IN_SNIPPET
            self.current_snippet = []

        elif tag == 'span' and 'link-text' in classes:
            # 5. This is the link, after a snippet if any. We're catching it
            #    detect results without a snippet. If so, transition directly
            #    from TITLE_PARSED to OUTSIDE
            debug('Got link-text')
            if self.state == ParserState.TITLE_PARSED:
                # No snippet
                self.state = ParserState.OUTSIDE
                self.current_snippet = []

    def handle_endtag(self, tag):
        if tag in STACKED_TAGS:
            item = self.stack.pop()
            assert item[0] == tag, (item, tag)

        if tag == 'a' and self.state == ParserState.IN_TITLE:
            # 2. End of the <a> node matched in step 1; transition from
            #    IN_TITLE to TITLE_PARSED
            debug('Title parsed')
            self.state = ParserState.TITLE_PARSED
        elif tag == 'td' and self.state == ParserState.IN_SNIPPET:
            # 4. End of the <td> node matched in step 3, this concludes the
            #    parsing of this result. Transition from IN_SNIPPET to OUTSIDE
            debug('Snippet parsed')
            self.build_result()
            self.state = ParserState.OUTSIDE

    def handle_data(self, data):
        if self.state == ParserState.IN_TITLE:
            debug('Got title data: %s', data)
            self.current_title.append(data)
        elif self.state == ParserState.IN_SNIPPET:
            debug('Got snippet data: %s', data)
            self.current_snippet.append(data)

    def build_result(self):
        res = result(
            link=self.current_link,
            title=''.join(self.current_title),
            snippet=''.join(self.current_snippet),
        )
        debug('Finished parsing result: %r', res)
        self.results.append(res)
        self.reset_current_result()

if __name__ == '__main__':
    parser = DDGHTMLParser()
    with open('ddg2.html') as fd:
        parser.feed(fd.read())
    print(parser.results)
DDG: Add missing copyright notice. 2020-05-15 21:36:31 +02:00			`###`
			`# Copyright (c) 2020, Valentin Lorentz`
			`# All rights reserved.`
			`#`
			`# Redistribution and use in source and binary forms, with or without`
			`# modification, are permitted provided that the following conditions are met:`
			`#`
			`# * Redistributions of source code must retain the above copyright notice,`
			`# this list of conditions, and the following disclaimer.`
			`# * Redistributions in binary form must reproduce the above copyright notice,`
			`# this list of conditions, and the following disclaimer in the`
			`# documentation and/or other materials provided with the distribution.`
			`# * Neither the name of the author of this software nor the name of`
			`# contributors to this software may be used to endorse or promote products`
			`# derived from this software without specific prior written consent.`
			`#`
			`# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE`
			`# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`# POSSIBILITY OF SUCH DAMAGE.`

			`###`

DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`import enum`
			`import collections`
			`from html.parser import HTMLParser`

DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`DEBUG = False`

			`def debug(msg, *args):`
			`if DEBUG:`
			`print(msg % args)`

DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`result = collections.namedtuple('result', 'link title snippet')`

			`@enum.unique`
			`class ParserState(enum.Enum):`
			`OUTSIDE = 0`
			`IN_TITLE = 1`
			`TITLE_PARSED = 2`
			`IN_SNIPPET = 3`

DDG: better document the transitions in the parser. 2020-07-01 20:05:12 +02:00
			`# This is implemented as a stack automaton. Here is the transition graph.`
			`# See comments below to find the description of each transition`
			`#`
			`# --> OUTSIDE --(1)--> IN_TITLE --(2)--> TITLE_PARSED --(3)--> IN_SNIPPET`
			`# ^ ^ \| \|`
			`# \| \| \| \|`
			`# \| +----------------(5)---------------+ \|`
			`# \| \|`
			`# +------------------------------(4)--------------------------+`


DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`STACKED_TAGS = ('table', 'tr', 'td', 'a')`

			`class DDGHTMLParser(HTMLParser):`
			`def __init__(self):`
			`super().__init__()`
			`self.stack = []`
			`self.results = []`

			`self.reset_current_result()`

			`def reset_current_result(self):`
			`self.state = ParserState.OUTSIDE`
			`self.current_link = None`
			`self.current_title = None`
			`self.current_snippet = None`

			`def handle_starttag(self, tag, attrs):`
			`attrs = dict(attrs)`
			`classes = attrs.get('class', '').split()`

			`if tag in STACKED_TAGS:`
DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`debug('Stacking %s with classes %s', tag, classes)`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`self.stack.append((tag, classes))`

			`if ('tr', ['result-sponsored']) in self.stack:`
			`# Skip sponsored results`
			`return`

			`if tag == 'a' and 'result-link' in classes:`
DDG: better document the transitions in the parser. 2020-07-01 20:05:12 +02:00			`# 1. Starts the title of a result; transition from OUTSIDE`
			`# to IN_TITLE`
DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`debug('Got result-link')`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)`
			`self.state = ParserState.IN_TITLE`
			`self.current_link = attrs['href']`
			`self.current_title = []`

			`elif tag == 'td' and 'result-snippet' in classes:`
DDG: better document the transitions in the parser. 2020-07-01 20:05:12 +02:00			`# 3. Starts a snippet. Normally, just after a title ended.`
			`# Transition from TITLE_PARSED to IN_SNIPPET`
DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`debug('Got result-snipper')`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`assert self.state == ParserState.TITLE_PARSED, self.state`
			`self.state = ParserState.IN_SNIPPET`
			`self.current_snippet = []`

			`elif tag == 'span' and 'link-text' in classes:`
DDG: better document the transitions in the parser. 2020-07-01 20:05:12 +02:00			`# 5. This is the link, after a snippet if any. We're catching it`
			`# detect results without a snippet. If so, transition directly`
			`# from TITLE_PARSED to OUTSIDE`
DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`debug('Got link-text')`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`if self.state == ParserState.TITLE_PARSED:`
			`# No snippet`
			`self.state = ParserState.OUTSIDE`
			`self.current_snippet = []`

			`def handle_endtag(self, tag):`
			`if tag in STACKED_TAGS:`
			`item = self.stack.pop()`
			`assert item[0] == tag, (item, tag)`

			`if tag == 'a' and self.state == ParserState.IN_TITLE:`
DDG: better document the transitions in the parser. 2020-07-01 20:05:12 +02:00			`# 2. End of the <a> node matched in step 1; transition from`
			`# IN_TITLE to TITLE_PARSED`
DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`debug('Title parsed')`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`self.state = ParserState.TITLE_PARSED`
			`elif tag == 'td' and self.state == ParserState.IN_SNIPPET:`
DDG: better document the transitions in the parser. 2020-07-01 20:05:12 +02:00			`# 4. End of the <td> node matched in step 3, this concludes the`
			`# parsing of this result. Transition from IN_SNIPPET to OUTSIDE`
DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`debug('Snippet parsed')`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`self.build_result()`
			`self.state = ParserState.OUTSIDE`

			`def handle_data(self, data):`
			`if self.state == ParserState.IN_TITLE:`
DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`debug('Got title data: %s', data)`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`self.current_title.append(data)`
			`elif self.state == ParserState.IN_SNIPPET:`
DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`debug('Got snippet data: %s', data)`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`self.current_snippet.append(data)`

			`def build_result(self):`
DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`res = result(`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`link=self.current_link,`
			`title=''.join(self.current_title),`
			`snippet=''.join(self.current_snippet),`
DDG: Add debug prints in the parser. 2020-07-01 10:34:13 +02:00			`)`
			`debug('Finished parsing result: %r', res)`
			`self.results.append(res)`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`self.reset_current_result()`

			`if __name__ == '__main__':`
			`parser = DDGHTMLParser()`
			`with open('ddg2.html') as fd:`
			`parser.feed(fd.read())`
			`print(parser.results)`