From 4da1291876c8a1d3750f2633dcc71013bddcf01c Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Sun, 30 Oct 2022 20:43:43 +0100 Subject: [PATCH] URL: Lazily deserialize records from the end in @last Before this commit, the plugin first fetched a list of all (deserialized) records in a list, then reversed the list, and iterated on the reverse list. This proved to be slow, with most of the time being spent in `dbi.DB._newRecord` (which essentially deserializes one list of CSV). After this commit, the list is reversed first, then the plugin iterates on its generator, which calls `_newRecord` on records as they are requested. This means that when there are many URLs in the database, `@last` does not need to waste time deserializing most records, when the result is near the end (and if the result is the first record, then it does exactly as much work as before). --- plugins/URL/plugin.py | 18 ++++++++++-------- src/dbi.py | 26 ++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/plugins/URL/plugin.py b/plugins/URL/plugin.py index 2a609a6c5..24921d523 100644 --- a/plugins/URL/plugin.py +++ b/plugins/URL/plugin.py @@ -1,7 +1,7 @@ ### # Copyright (c) 2002-2004, Jeremiah Fincher # Copyright (c) 2010, James McCoy -# Copyright (c) 2010-2021, Valentin Lorentz +# Copyright (c) 2010-2022, Valentin Lorentz # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,6 +29,8 @@ # POSSIBILITY OF SUCH DAMAGE. ### +import itertools + import supybot.dbi as dbi import supybot.conf as conf import supybot.utils as utils @@ -57,9 +59,7 @@ class DbiUrlDB(plugins.DbiChannelDB): near=msg.args[1], at=msg.receivedAt) super(self.__class__, self).add(record) def urls(self, p): - L = list(self.select(p)) - L.reverse() - return L + return self.select(p, reverse=True) URLDB = plugins.DB('URL', {'flat': DbiUrlDB}) @@ -142,16 +142,18 @@ class URL(callbacks.Plugin): if not predicate(record): return False return True - urls = [record.url for record in self.db.urls(channel, predicate)] - if not urls: + urls = (record.url for record in self.db.urls(channel, predicate)) + (urls, urls_copy) = itertools.tee(urls) + first_url = next(urls_copy, None) + if first_url is None: irc.reply(_('No URLs matched that criteria.')) else: if nolimit: - urls = [format('%u', url) for url in urls] + urls = (format('%u', url) for url in urls) s = ', '.join(urls) else: # We should optimize this with another URLDB method eventually. - s = urls[0] + s = first_url irc.reply(s) last = wrap(last, ['channeldb', getopts({'from': 'something', 'with': 'something', diff --git a/src/dbi.py b/src/dbi.py index d5bdfc466..33fa90fbd 100644 --- a/src/dbi.py +++ b/src/dbi.py @@ -1,6 +1,6 @@ ### # Copyright (c) 2002-2005, Jeremiah Fincher -# Copyright (c) 2010-2021, Valentin Lorentz +# Copyright (c) 2010-2022, Valentin Lorentz # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -358,12 +358,30 @@ class DB(object): self.map.remove(id) def __iter__(self): - for (id, s) in self.map: + yield from self._iter() + + def _iter(self, *, reverse=False): + if reverse: + if hasattr(self.map, "__reversed__"): + # neither FlatfileMapping nor CdbMapping support __reversed__ + # and DirMapping does not support iteration at all, but + # there is no harm in allowing this short-path in case + # plugins use their own mapping. + it = reversed(self.map) + else: + # This does load the whole database in memory instead of + # iterating lazily, but plugins requesting a reverse list + # would probably need do it themselves otherwise, so it does + # not make matters worse to do it here. + it = reversed(list(self.map)) + else: + it = self.map + for (id, s) in it: # We don't need to yield the id because it's in the record. yield self._newRecord(id, s) - def select(self, p): - for record in self: + def select(self, p, reverse=False): + for record in self._iter(reverse=reverse): if p(record): yield record