From fcfda73f64bab7de0ab0a597beac9bb55d94f4b2 Mon Sep 17 00:00:00 2001
From: James Vega <jamessan@users.sourceforge.net>
Date: Sat, 30 Apr 2005 12:53:42 +0000
Subject: [PATCH] Bug #1190350, Don't grab fake title.

---
 plugins/Web/plugin.py | 37 +++++++++++++++++++++++++++++--------
 plugins/Web/test.py   | 10 ++++++++--
 2 files changed, 37 insertions(+), 10 deletions(-)
diff --git a/plugins/Web/plugin.py b/plugins/Web/plugin.py
index 6a9a6798c..cb5c55280 100644
--- a/plugins/Web/plugin.py
+++ b/plugins/Web/plugin.py
@@ -29,6 +29,8 @@
 
 import re
 
+from HTMLParser import HTMLParser
+
 import supybot.conf as conf
 import supybot.utils as utils
 from supybot.commands import *
@@ -36,6 +38,24 @@ import supybot.plugins as plugins
 import supybot.ircutils as ircutils
 import supybot.callbacks as callbacks
 
+class Title(HTMLParser):  # Collects the text found inside <title> elements.
+    def __init__(self, *args, **kwargs):
+        self.inTitle = False  # True between <title> and </title>.
+        self.title = None  # Stays None until some title text has been seen.
+        HTMLParser.__init__(self, *args, **kwargs)
+
+    def handle_starttag(self, tag, attrs):
+        if tag == 'title':
+            self.inTitle = True
+
+    def handle_data(self, data):
+        if self.inTitle:  # Append: one title may arrive as several chunks.
+            self.title = (self.title or '') + data
+
+    def handle_endtag(self, tag):
+        if tag == 'title':
+            self.inTitle = False
+
 class Web(callbacks.PluginRegexp):
     """Add the help for "@help Web" here."""
     threaded = True
@@ -45,8 +65,7 @@ class Web(callbacks.PluginRegexp):
             super(Web, self).callCommand(command, irc, msg, *args, **kwargs)
         except utils.web.Error, e:
             irc.reply(str(e))
-            
-    _titleRe = re.compile(r'<title>(.*?)</title>', re.I | re.S)
+
     def titleSnarfer(self, irc, msg, match):
         r"https?://[^\])>\s]+"
         channel = msg.args[0]
@@ -66,10 +85,11 @@ class Web(callbacks.PluginRegexp):
             except utils.web.Error, e:
                 self.log.info('Couldn\'t snarf title of %u: %s.', url, e)
                 return
-            m = self._titleRe.search(text)
-            if m is not None:
+            parser = Title()
+            parser.feed(text)
+            if parser.title is not None:
                 domain = utils.web.getDomain(url)
-                title = utils.web.htmlToText(m.group(1).strip())
+                title = utils.web.htmlToText(parser.title.strip())
                 s = format('Title: %s (at %s)', title, domain)
                 irc.reply(s, prefixName=False)
     titleSnarfer = urlSnarfer(titleSnarfer)
@@ -137,9 +157,10 @@ class Web(callbacks.PluginRegexp):
         """
         size = conf.supybot.protocols.http.peekSize()
         text = utils.web.getUrl(url, size=size)
-        m = self._titleRe.search(text)
-        if m is not None:
-            irc.reply(utils.web.htmlToText(m.group(1).strip()))
+        parser = Title()
+        parser.feed(text)
+        if parser.title is not None:
+            irc.reply(utils.web.htmlToText(parser.title.strip()))
         else:
             irc.reply(format('That URL appears to have no HTML title '
                              'within the first %i bytes.', size))
diff --git a/plugins/Web/test.py b/plugins/Web/test.py
index be198e844..006b02051 100644
--- a/plugins/Web/test.py
+++ b/plugins/Web/test.py
@@ -29,7 +29,7 @@
 
 from supybot.test import *
 
-class WebTestCase(PluginTestCase):
+class WebTestCase(ChannelPluginTestCase):
     plugins = ('Web',)
     if network:
         def testHeaders(self):
@@ -68,9 +68,15 @@ class WebTestCase(PluginTestCase):
                                 'jupiter_dark_spot_031023.html',
                                 'Mystery Spot on Jupiter Baffles Astronomers')
             # Checks for @title not-working correctly
-            self.assertResponse('title '\
+            self.assertResponse('title '
                 'http://www.catb.org/~esr/jargon/html/F/foo.html',
                 'foo')
+            # Checks that only the real title tag is grabbed, not title
+            # tags embedded in, for example, script tags. Bug #1190350
+            self.assertNotRegexp('title '
+                'http://www.irinnews.org/report.asp?ReportID=45910&'
+                'SelectRegion=West_Africa&SelectCountry=CHAD',
+                r'document\.write\(')
 
         def testNetcraft(self):
             self.assertNotError('netcraft slashdot.org')