Limnoria/plugins/Google/parser.py

###
# Copyright (c) 2020, Valentin Lorentz
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions, and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions, and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of the author of this software nor the name of
#     contributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

###

import enum
import collections
from html.parser import HTMLParser

import supybot.utils as utils

result = collections.namedtuple('result', 'link title snippet')

@enum.unique
class ParserState(enum.Enum):
    OUTSIDE = 0
    IN_LINK = 1
    IN_TITLE = 2
    TITLE_PARSED = 3
    BREADCRUMBS_PARSED = 5
    LINK_PARSED = 6

@enum.unique
class DomMark(enum.Enum):
    """A mark on an element in the stack, to know when to change state when
    poping the element from the stack."""
    HEADING = 1
    BREADCRUMBS = 2

STACKED_TAGS = ('div', 'span', 'a')

class GoogleHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.stack = []
        self.results = []

        self.reset_current_result()

    def reset_current_result(self):
        self.state = ParserState.OUTSIDE
        self.current_link = None
        self.current_title = None
        self.current_snippet = None

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        classes = attrs.get('class', '').split()

        if tag in STACKED_TAGS:
            self.stack.append(tag)

        if tag == 'a' and attrs['href'].startswith('/url?q=') \
                and self.state == ParserState.OUTSIDE:
            self.state = ParserState.IN_LINK
            href = attrs['href'][len('/url?q='):]
            self.current_link = utils.web.urlunquote(href.split('&sa')[0])

        elif tag == 'h3' and 'a' in self.stack and self.state == ParserState.IN_LINK:
            self.state = ParserState.IN_TITLE
            mark = DomMark.HEADING

    def handle_endtag(self, tag):
        if tag in STACKED_TAGS:
            item = self.stack.pop()
            assert item == tag, (item, tag)

        if tag == 'a' and self.state in (
                ParserState.IN_LINK, ParserState.IN_TITLE, ParserState.BREADCRUMBS_PARSED):
            if self.current_title is None:
                # That wasn't a result
                self.state = ParserState.OUTSIDE
            else:
                self.state = ParserState.LINK_PARSED

    def handle_data(self, data):
        if self.state == ParserState.IN_TITLE:
            self.current_title = data
            self.state = ParserState.TITLE_PARSED
        elif self.state == ParserState.TITLE_PARSED:
            self.state = ParserState.BREADCRUMBS_PARSED
        elif self.state == ParserState.LINK_PARSED:
            self.current_snippet = data
            self.state = ParserState.OUTSIDE
            self.build_result()

    def build_result(self):
        self.results.append(result(
            link=self.current_link,
            title=self.current_title,
            snippet=self.current_snippet,
        ))
        self.reset_current_result()

if __name__ == '__main__':
    parser = GoogleHTMLParser()
    with open('google.html') as fd:
        parser.feed(fd.read())
    print(parser.results)
Google: Update, with a proper HTML parser. Hopefully it will be more robust than the regexp. 2020-05-24 18:57:27 +02:00			`###`
			`# Copyright (c) 2020, Valentin Lorentz`
			`# All rights reserved.`
			`#`
			`# Redistribution and use in source and binary forms, with or without`
			`# modification, are permitted provided that the following conditions are met:`
			`#`
			`# * Redistributions of source code must retain the above copyright notice,`
			`# this list of conditions, and the following disclaimer.`
			`# * Redistributions in binary form must reproduce the above copyright notice,`
			`# this list of conditions, and the following disclaimer in the`
			`# documentation and/or other materials provided with the distribution.`
			`# * Neither the name of the author of this software nor the name of`
			`# contributors to this software may be used to endorse or promote products`
			`# derived from this software without specific prior written consent.`
			`#`
			`# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE`
			`# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`# POSSIBILITY OF SUCH DAMAGE.`

			`###`

			`import enum`
			`import collections`
			`from html.parser import HTMLParser`

			`import supybot.utils as utils`

			`result = collections.namedtuple('result', 'link title snippet')`

			`@enum.unique`
			`class ParserState(enum.Enum):`
			`OUTSIDE = 0`
			`IN_LINK = 1`
			`IN_TITLE = 2`
			`TITLE_PARSED = 3`
			`BREADCRUMBS_PARSED = 5`
			`LINK_PARSED = 6`

			`@enum.unique`
			`class DomMark(enum.Enum):`
			`"""A mark on an element in the stack, to know when to change state when`
			`poping the element from the stack."""`
			`HEADING = 1`
			`BREADCRUMBS = 2`

			`STACKED_TAGS = ('div', 'span', 'a')`

			`class GoogleHTMLParser(HTMLParser):`
			`def __init__(self):`
			`super().__init__()`
			`self.stack = []`
			`self.results = []`

			`self.reset_current_result()`

			`def reset_current_result(self):`
			`self.state = ParserState.OUTSIDE`
			`self.current_link = None`
			`self.current_title = None`
			`self.current_snippet = None`

			`def handle_starttag(self, tag, attrs):`
			`attrs = dict(attrs)`
			`classes = attrs.get('class', '').split()`

			`if tag in STACKED_TAGS:`
			`self.stack.append(tag)`

Google: Fix false positives on embedded thumbnails (eg. Youtube results). 2020-05-28 19:07:08 +02:00			`if tag == 'a' and attrs['href'].startswith('/url?q=') \`
			`and self.state == ParserState.OUTSIDE:`
Google: Update, with a proper HTML parser. Hopefully it will be more robust than the regexp. 2020-05-24 18:57:27 +02:00			`self.state = ParserState.IN_LINK`
			`href = attrs['href'][len('/url?q='):]`
Google: Fix plugin. 2020-07-01 02:53:02 -04:00			`self.current_link = utils.web.urlunquote(href.split('&sa')[0])`
Google: Update, with a proper HTML parser. Hopefully it will be more robust than the regexp. 2020-05-24 18:57:27 +02:00
Google: Fix plugin. 2020-07-01 04:00:36 -04:00			`elif tag == 'h3' and 'a' in self.stack and self.state == ParserState.IN_LINK:`
Google: Update, with a proper HTML parser. Hopefully it will be more robust than the regexp. 2020-05-24 18:57:27 +02:00			`self.state = ParserState.IN_TITLE`
			`mark = DomMark.HEADING`

			`def handle_endtag(self, tag):`
			`if tag in STACKED_TAGS:`
			`item = self.stack.pop()`
			`assert item == tag, (item, tag)`

			`if tag == 'a' and self.state in (`
			`ParserState.IN_LINK, ParserState.IN_TITLE, ParserState.BREADCRUMBS_PARSED):`
			`if self.current_title is None:`
			`# That wasn't a result`
			`self.state = ParserState.OUTSIDE`
			`else:`
			`self.state = ParserState.LINK_PARSED`

			`def handle_data(self, data):`
			`if self.state == ParserState.IN_TITLE:`
			`self.current_title = data`
			`self.state = ParserState.TITLE_PARSED`
			`elif self.state == ParserState.TITLE_PARSED:`
			`self.state = ParserState.BREADCRUMBS_PARSED`
			`elif self.state == ParserState.LINK_PARSED:`
			`self.current_snippet = data`
			`self.state = ParserState.OUTSIDE`
			`self.build_result()`

			`def build_result(self):`
			`self.results.append(result(`
			`link=self.current_link,`
			`title=self.current_title,`
			`snippet=self.current_snippet,`
			`))`
			`self.reset_current_result()`

			`if __name__ == '__main__':`
			`parser = GoogleHTMLParser()`
			`with open('google.html') as fd:`
			`parser.feed(fd.read())`
			`print(parser.results)`