From eaf722250938253f1aa7def5b79bd4ccc55976b6 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 14 May 2020 21:33:34 +0200 Subject: [PATCH] Fediverse: Cache host support for webfinger before snarfing. This is much cheaper both for us and the host if the host doesn't support activitypub at all (which is what happens most of the time). --- plugins/Fediverse/activitypub.py | 42 ++++++++++++++++ plugins/Fediverse/plugin.py | 38 +++++++++++++- plugins/Fediverse/test.py | 85 +++++++++++++++++++++++++++----- 3 files changed, 151 insertions(+), 14 deletions(-) diff --git a/plugins/Fediverse/activitypub.py b/plugins/Fediverse/activitypub.py index 99d556cbd..895199367 100644 --- a/plugins/Fediverse/activitypub.py +++ b/plugins/Fediverse/activitypub.py @@ -34,6 +34,7 @@ import email import base64 import functools import contextlib +import urllib.error import urllib.parse import xml.etree.ElementTree as ET @@ -129,6 +130,47 @@ def _get_webfinger_url(hostname): return "https://%s/.well-known/webfinger?resource={uri}" +def has_webfinger_support(hostname): + """Returns whether the hostname probably supports webfinger or not. + + This relies on an edge case of the Webfinger specification, + so it may not successfully detect some hosts because they don't follow + the specification.""" + request = urllib.request.Request( + "https://%s/.well-known/webfinger" % hostname, method="HEAD" + ) + try: + urllib.request.urlopen(request) + except urllib.error.HTTPError as e: + if e.code == 400: + # RFC 7033 requires a 400 response when the "resource" parameter + # is missing: https://tools.ietf.org/html/rfc7033#section-4.2 + # + # This works for: + # * Misskey + # * PeerTube + # * Pleroma + return True + elif e.headers.get("Content-Type", "") == "application/jrd+json": + # WriteFreely, and possibly others. + # https://github.com/writeas/writefreely/issues/310 + return True + elif e.code == 404: + if e.headers.get("Server", "").lower() == "mastodon": + # https://github.com/tootsuite/mastodon/issues/13757 + return True + + # Else, the host probably doesn't support Webfinger. + + # Known false negatives: + # * Nextcloud (returns 404) + # * Pixelfed (returns 302 to the homepage): + # https://github.com/pixelfed/pixelfed/issues/2180 + # * Plume (returns 404): + # https://github.com/Plume-org/Plume/issues/770 + return False + + def webfinger(hostname, uri): template = _get_webfinger_url(hostname) assert template diff --git a/plugins/Fediverse/plugin.py b/plugins/Fediverse/plugin.py index a44577205..3e2617239 100644 --- a/plugins/Fediverse/plugin.py +++ b/plugins/Fediverse/plugin.py @@ -125,6 +125,14 @@ class Fediverse(callbacks.PluginRegexp): self._startHttp() self._actor_cache = utils.structures.TimeoutDict(timeout=600) + # Used when snarfing, to cheaply avoid querying non-ActivityPub + # servers. + # Is also written to when using commands that successfully find + # ActivityPub data. + self._webfinger_support_cache = utils.structures.TimeoutDict( + timeout=60 * 60 * 24 + ) + def _startHttp(self): callback = FediverseHttp() callback._plugin = self @@ -137,6 +145,13 @@ class Fediverse(callbacks.PluginRegexp): def _stopHttp(self): httpserver.unhook("fediverse") + def _has_webfinger_support(self, hostname): + if hostname not in self._webfinger_support_cache: + self._webfinger_support_cache[hostname] = ap.has_webfinger_support( + hostname + ) + return self._webfinger_support_cache[hostname] + def _get_actor(self, irc, username): if username in self._actor_cache: return self._actor_cache[username] @@ -167,6 +182,8 @@ class Fediverse(callbacks.PluginRegexp): if username: self._actor_cache[username] = actor + self._webfinger_support_cache[hostname] = True + self._actor_cache[actor["id"]] = actor return actor @@ -257,9 +274,17 @@ class Fediverse(callbacks.PluginRegexp): "snarfers.username", msg.channel, irc.network ): return + + if not self._has_webfinger_support(match.group("hostname")): + self.log.debug( + "Not snarfing, host doesn't have Webfinger support." + ) + return + try: actor = self._get_actor(irc, match.group(0)) - except ap.ActivityPubError: + except ap.ActivityPubError as e: + self.log.info("Could not fetch %s: %s", match.group(0), e) # Be silent on errors return @@ -282,6 +307,14 @@ class Fediverse(callbacks.PluginRegexp): snarf_status = self.registryValue("snarfers.status", channel, network) if not snarf_profile and not snarf_status: return + + hostname = urllib.parse.urlparse(url).hostname + if not self._has_webfinger_support(hostname): + self.log.debug( + "Not snarfing, host doesn't have Webfinger support." + ) + return + try: resource = ap.get_resource_from_url(url) except ap.ActivityPubError: @@ -357,6 +390,9 @@ class Fediverse(callbacks.PluginRegexp): status = ap.get_resource_from_url(url) except ap.ActivityPubError as e: irc.error(_("Could not get status: %s") % e.args[0], Raise=True) + else: + hostname = urllib.parse.urlparse(url).hostname + self._webfinger_support_cache[hostname] = True irc.reply(self._format_status(irc, msg, status)) diff --git a/plugins/Fediverse/test.py b/plugins/Fediverse/test.py index 8a3445460..7c4e6d367 100644 --- a/plugins/Fediverse/test.py +++ b/plugins/Fediverse/test.py @@ -34,9 +34,10 @@ import functools import contextlib from multiprocessing import Manager -from supybot import commands, conf, utils +from supybot import conf, log, utils from supybot.test import ChannelPluginTestCase, network +from . import activitypub as ap from .test_data import ( PRIVATE_KEY, HOSTMETA_URL, @@ -89,10 +90,31 @@ class NetworkedFediverseTestCase(BaseFediverseTestCase): "Error: Unknown user @nonexistinguser@oc.todon.fr.", ) + def testHasWebfingerSupport(self): + self.assertTrue(ap.has_webfinger_support("oc.todon.fr")) + self.assertFalse(ap.has_webfinger_support("example.org")) + class NetworklessFediverseTestCase(BaseFediverseTestCase): timeout = 0.1 + @contextlib.contextmanager + def mockWebfingerSupport(self, value): + original_has_webfinger_support = ap.has_webfinger_support + + @functools.wraps(original_has_webfinger_support) + def newf(hostname): + if value == "not called": + assert False + assert type(value) is bool + return value + + ap.has_webfinger_support = newf + + yield + + ap.has_webfinger_support = original_has_webfinger_support + @contextlib.contextmanager def mockRequests(self, expected_requests): with Manager() as m: @@ -105,6 +127,7 @@ class NetworklessFediverseTestCase(BaseFediverseTestCase): assert expected_requests, url (expected_url, response) = expected_requests.pop(0) self.assertEqual(url, expected_url, "Unexpected URL: %s" % url) + log.debug("Got request to %s", url) if isinstance(response, bytes): return response @@ -225,7 +248,7 @@ class NetworklessFediverseTestCase(BaseFediverseTestCase): ) def testProfileSnarfer(self): - with self.mockRequests([]): + with self.mockWebfingerSupport("not called"), self.mockRequests([]): self.assertSnarfNoResponse("aaa @nonexistinguser@example.org bbb") with conf.supybot.plugins.Fediverse.snarfers.username.context(True): @@ -235,24 +258,46 @@ class NetworklessFediverseTestCase(BaseFediverseTestCase): (ACTOR_URL, ACTOR_DATA), ] - with self.mockRequests(expected_requests): + # First request, should work + with self.mockWebfingerSupport(True), self.mockRequests( + expected_requests + ): self.assertSnarfResponse( "aaa @someuser@example.org bbb", "\x02someuser\x02 (@someuser@example.org): My Biography", ) + # Same request; it is all cached + with self.mockWebfingerSupport("not called"), self.mockRequests( + [] + ): + self.assertSnarfResponse( + "aaa @someuser@example.org bbb", + "\x02someuser\x02 (@someuser@example.org): My Biography", + ) + + # Nonexisting user + expected_requests = [ (HOSTMETA_URL, HOSTMETA_DATA), (WEBFINGER_URL, utils.web.Error("blah")), ] - with self.mockRequests(expected_requests): + with self.mockWebfingerSupport("not called"), self.mockRequests( + expected_requests + ): self.assertSnarfNoResponse( "aaa @nonexistinguser@example.org bbb" ) + def testProfileSnarferNoWebfinger(self): + with conf.supybot.plugins.Fediverse.snarfers.username.context(False): + # No webfinger support, shouldn't make requests + with self.mockWebfingerSupport(False), self.mockRequests([]): + self.assertSnarfNoResponse("aaa @someuser@example.org bbb") + def testProfileUrlSnarfer(self): - with self.mockRequests([]): + with self.mockWebfingerSupport("not called"), self.mockRequests([]): self.assertSnarfNoResponse( "aaa https://example.org/users/someuser bbb" ) @@ -260,14 +305,18 @@ class NetworklessFediverseTestCase(BaseFediverseTestCase): with conf.supybot.plugins.Fediverse.snarfers.profile.context(True): expected_requests = [(ACTOR_URL, utils.web.Error("blah"))] - with self.mockRequests(expected_requests): + with self.mockWebfingerSupport(True), self.mockRequests( + expected_requests + ): self.assertSnarfNoResponse( "aaa https://example.org/users/someuser bbb" ) expected_requests = [(ACTOR_URL, ACTOR_DATA)] - with self.mockRequests(expected_requests): + with self.mockWebfingerSupport("not called"), self.mockRequests( + expected_requests + ): self.assertSnarfResponse( "aaa https://example.org/users/someuser bbb", "\x02someuser\x02 (@someuser@example.org): My Biography", @@ -363,7 +412,7 @@ class NetworklessFediverseTestCase(BaseFediverseTestCase): ) def testStatusUrlSnarferDisabled(self): - with self.mockRequests([]): + with self.mockWebfingerSupport("not called"), self.mockRequests([]): self.assertSnarfNoResponse( "aaa https://example.org/users/someuser/statuses/1234 bbb" ) @@ -375,7 +424,9 @@ class NetworklessFediverseTestCase(BaseFediverseTestCase): (ACTOR_URL, ACTOR_DATA), ] - with self.mockRequests(expected_requests): + with self.mockWebfingerSupport(True), self.mockRequests( + expected_requests + ): self.assertSnarfResponse( "aaa https://example.org/users/someuser/statuses/1234 bbb", "\x02someuser\x02 (@someuser@example.org): " @@ -386,7 +437,9 @@ class NetworklessFediverseTestCase(BaseFediverseTestCase): with conf.supybot.plugins.Fediverse.snarfers.status.context(True): expected_requests = [(STATUS_URL, utils.web.Error("blah"))] - with self.mockRequests(expected_requests): + with self.mockWebfingerSupport(True), self.mockRequests( + expected_requests + ): self.assertSnarfNoResponse( "aaa https://example.org/users/someuser/statuses/1234 bbb" ) @@ -396,7 +449,9 @@ class NetworklessFediverseTestCase(BaseFediverseTestCase): (ACTOR_URL, utils.web.Error("blah")), ] - with self.mockRequests(expected_requests): + with self.mockWebfingerSupport("not called"), self.mockRequests( + expected_requests + ): self.assertSnarfResponse( "aaa https://example.org/users/someuser/statuses/1234 bbb", ": @ FirstAuthor I am replying to you", @@ -407,7 +462,9 @@ class NetworklessFediverseTestCase(BaseFediverseTestCase): with conf.supybot.plugins.Fediverse.snarfers.profile.context(True): expected_requests = [(STATUS_URL, STATUS_DATA)] - with self.mockRequests(expected_requests): + with self.mockWebfingerSupport(True), self.mockRequests( + expected_requests + ): self.assertSnarfNoResponse( "aaa https://example.org/users/someuser/statuses/1234 bbb" ) @@ -416,7 +473,9 @@ class NetworklessFediverseTestCase(BaseFediverseTestCase): with conf.supybot.plugins.Fediverse.snarfers.profile.context(True): expected_requests = [(ACTOR_URL, ACTOR_DATA)] - with self.mockRequests(expected_requests): + with self.mockWebfingerSupport("not called"), self.mockRequests( + expected_requests + ): self.assertSnarfNoResponse( "aaa https://example.org/users/someuser/ bbb" )