3
0
mirror of https://github.com/pragma-/pbot.git synced 2024-11-29 23:39:24 +01:00
pbot/lib/PBot/Plugin/UrlTitles.pm

343 lines
10 KiB
Perl
Raw Normal View History

# File: UrlTitles.pm
#
# Purpose: Display titles of URLs in channel messages.
# SPDX-FileCopyrightText: 2021, 2022 Pragmatic Software <pragma78@gmail.com>
2021-07-11 00:00:22 +02:00
# SPDX-License-Identifier: MIT
License project under MPL2 This patch adds the file LICENSE which is the verbatim copy of the Mozilla Public License Version 2.0 as retreived from https://www.mozilla.org/media/MPL/2.0/index.815ca599c9df.txt on 2017-03-05. This patch also places license headers for the MPL2 type A variant of the license header in the following files: PBot/AntiFlood.pm PBot/BanTracker.pm PBot/BlackList.pm PBot/BotAdminCommands.pm PBot/BotAdmins.pm PBot/ChanOpCommands.pm PBot/ChanOps.pm PBot/Channels.pm PBot/Commands.pm PBot/DualIndexHashObject.pm PBot/EventDispatcher.pm PBot/FactoidCommands.pm PBot/FactoidModuleLauncher.pm PBot/Factoids.pm PBot/HashObject.pm PBot/IRCHandlers.pm PBot/IgnoreList.pm PBot/IgnoreListCommands.pm PBot/Interpreter.pm PBot/LagChecker.pm PBot/Logger.pm PBot/MessageHistory.pm PBot/MessageHistory_SQLite.pm PBot/NickList.pm PBot/PBot.pm PBot/Plugins.pm PBot/Plugins/AntiAway.pm PBot/Plugins/AntiKickAutoRejoin.pm PBot/Plugins/AntiRepeat.pm PBot/Plugins/AntiTwitter.pm PBot/Plugins/AutoRejoin.pm PBot/Plugins/Counter.pm PBot/Plugins/Quotegrabs.pm PBot/Plugins/Quotegrabs/Quotegrabs_Hashtable.pm PBot/Plugins/Quotegrabs/Quotegrabs_SQLite.pm PBot/Plugins/UrlTitles.pm PBot/Plugins/_Example.pm PBot/Refresher.pm PBot/Registerable.pm PBot/Registry.pm PBot/RegistryCommands.pm PBot/SQLiteLogger.pm PBot/SQLiteLoggerLayer.pm PBot/SelectHandler.pm PBot/StdinReader.pm PBot/Timer.pm PBot/Utils/ParseDate.pm PBot/VERSION.pm build/update-version.pl modules/acronym.pl modules/ago.pl modules/c11std.pl modules/c2english.pl modules/c2english/CGrammar.pm modules/c2english/c2eng.pl modules/c99std.pl modules/cdecl.pl modules/cfaq.pl modules/cjeopardy/IRCColors.pm modules/cjeopardy/QStatskeeper.pm modules/cjeopardy/Scorekeeper.pm modules/cjeopardy/cjeopardy.pl modules/cjeopardy/cjeopardy_answer.pl modules/cjeopardy/cjeopardy_filter.pl modules/cjeopardy/cjeopardy_hint.pl modules/cjeopardy/cjeopardy_qstats.pl modules/cjeopardy/cjeopardy_scores.pl modules/cjeopardy/cjeopardy_show.pl 
modules/codepad.pl modules/compiler_block.pl modules/compiler_client.pl modules/compiler_vm/Diff.pm modules/compiler_vm/cc modules/compiler_vm/compiler_client.pl modules/compiler_vm/compiler_server.pl modules/compiler_vm/compiler_server_vbox_win32.pl modules/compiler_vm/compiler_server_watchdog.pl modules/compiler_vm/compiler_vm_client.pl modules/compiler_vm/compiler_vm_server.pl modules/compiler_vm/compiler_watchdog.pl modules/compiler_vm/languages/_c_base.pm modules/compiler_vm/languages/_default.pm modules/compiler_vm/languages/bash.pm modules/compiler_vm/languages/bc.pm modules/compiler_vm/languages/bf.pm modules/compiler_vm/languages/c11.pm modules/compiler_vm/languages/c89.pm modules/compiler_vm/languages/c99.pm modules/compiler_vm/languages/clang.pm modules/compiler_vm/languages/clang11.pm modules/compiler_vm/languages/clang89.pm modules/compiler_vm/languages/clang99.pm modules/compiler_vm/languages/clangpp.pm modules/compiler_vm/languages/clisp.pm modules/compiler_vm/languages/cpp.pm modules/compiler_vm/languages/freebasic.pm modules/compiler_vm/languages/go.pm modules/compiler_vm/languages/haskell.pm modules/compiler_vm/languages/java.pm modules/compiler_vm/languages/javascript.pm modules/compiler_vm/languages/ksh.pm modules/compiler_vm/languages/lua.pm modules/compiler_vm/languages/perl.pm modules/compiler_vm/languages/python.pm modules/compiler_vm/languages/python3.pm modules/compiler_vm/languages/qbasic.pm modules/compiler_vm/languages/scheme.pm modules/compiler_vm/languages/server/_c_base.pm modules/compiler_vm/languages/server/_default.pm modules/compiler_vm/languages/server/c11.pm modules/compiler_vm/languages/server/c89.pm modules/compiler_vm/languages/server/c99.pm modules/compiler_vm/languages/server/clang.pm modules/compiler_vm/languages/server/clang11.pm modules/compiler_vm/languages/server/clang89.pm modules/compiler_vm/languages/server/clang99.pm modules/compiler_vm/languages/server/cpp.pm modules/compiler_vm/languages/server/freebasic.pm 
modules/compiler_vm/languages/server/haskell.pm modules/compiler_vm/languages/server/java.pm modules/compiler_vm/languages/server/qbasic.pm modules/compiler_vm/languages/server/tendra.pm modules/compiler_vm/languages/sh.pm modules/compiler_vm/languages/tendra.pm modules/compliment modules/cstd.pl modules/define.pl modules/dice_roll.pl modules/excuse.sh modules/expand_macros.pl modules/fnord.pl modules/funnyish_quote.pl modules/g.pl modules/gdefine.pl modules/gen_cfacts.pl modules/gencstd.pl modules/get_title.pl modules/getcfact.pl modules/google.pl modules/gspy.pl modules/gtop10.pl modules/gtop15.pl modules/headlines.pl modules/horoscope modules/horrorscope modules/ideone.pl modules/insult.pl modules/love_quote.pl modules/man.pl modules/map.pl modules/math.pl modules/prototype.pl modules/qalc.pl modules/random_quote.pl modules/seen.pl modules/urban modules/weather.pl modules/wikipedia.pl pbot.pl pbot.sh It is highly recommended that this list of files is reviewed to ensure that all files are the copyright of the sole maintainer of the repository. If any files with license headers contain the intellectual property of anyone else, it is recommended that a request is made to revise this patch or that the explicit permission of the co-author is gained to allow for the license of the work to be changed. I (Tomasz Kramkowski), the contributor, take no responsibility for any legal action taken against the maintainer of this repository for incorrectly claiming copyright to any work not owned by the maintainer of this repository.
2017-03-05 22:33:31 +01:00
2021-07-14 04:45:56 +02:00
package PBot::Plugin::UrlTitles;
use parent 'PBot::Plugin::Base';
2021-06-19 06:23:34 +02:00
use PBot::Imports;
2019-07-11 03:40:53 +02:00
use Encode;
use Text::Levenshtein::XS qw(distance);
use LWP::UserAgent::Paranoid;
use HTML::Entities;
use JSON::XS;
# Limits applied to each title fetch in get_title().
use constant {
    TIMEOUT  => 30,           # given to the user agent as request_timeout
    MAX_SIZE => 1024 * 800,   # given to the user agent via max_size()
};
# Set up the plugin: load the URL-title history store, register the
# general.show_url_titles registry default and hook the channel message events.
#
# %conf may contain show_url_titles, which becomes the default value of the
# general.show_url_titles registry entry (1 when not supplied).
sub initialize {
    my ($self, %conf) = @_;

    # remember recent titles so we don't repeat them too often
    my $filename = $self->{pbot}->{registry}->get_value('general', 'data_dir') . '/url-title.hist';

    $self->{history} = PBot::Core::Storage::DualIndexHashObject->new(
        pbot     => $self->{pbot},
        name     => 'URL title history',
        filename => $filename,
    );

    $self->{history}->load;

    # can be overridden per-channel
    $self->{pbot}->{registry}->add_default('text', 'general', 'show_url_titles', $conf{show_url_titles} // 1);

    # handle these events
    $self->{pbot}->{event_dispatcher}->register_handler('irc.public',  sub { $self->show_url_titles(@_) });
    $self->{pbot}->{event_dispatcher}->register_handler('irc.caction', sub { $self->show_url_titles(@_) });
}
# Detach the plugin from the event dispatcher by removing the handlers for
# the two events hooked in initialize().
sub unload {
    my ($self) = @_;

    $self->{pbot}->{event_dispatcher}->remove_handler('irc.public');
    $self->{pbot}->{event_dispatcher}->remove_handler('irc.caction');
}
# Decide whether a URL should be skipped without fetching its title.
#
# Parameters:
#   $url - the URL text extracted from a channel message
#
# Returns 1 when the URL matches any of the patterns below, 0 otherwise.
# The patterns are independent of each other, so their order does not
# affect the result.
sub is_ignored_url {
    my ($self, $url) = @_;

    return 1 if $url =~ m{https?://matrix\.to}i;
    return 1 if $url =~ m{https?://.*\.c$}i;
    return 1 if $url =~ m{https?://.*\.h$}i;
    return 1 if $url =~ m{https?://ibb.co/}i;
    return 1 if $url =~ m{https?://.*onlinegdb.com}i;
    return 1 if $url =~ m{googlesource.com/}i;

    # git hosts are ignored unless the URL mentions a commit or is on GitHub;
    # the GitHub test is case-insensitive like the rest of this condition so
    # that e.g. https://GitHub.com/... is not ignored
    return 1 if $url =~ m{https?://git}i and $url !~ /commit/i and $url !~ /github.com/i;

    return 1 if $url =~ m{https://.*swissborg.com}i;
    return 1 if $url =~ m{https://streamable.com}i;
    return 1 if $url =~ m{https://matrix.org}i;
    return 1 if $url =~ m{https?://coliru\..*}i;

    # local/private targets and path tricks; these match anywhere in the URL
    return 1 if $url =~ m{localhost}i;
    return 1 if $url =~ m{127}i;
    return 1 if $url =~ m{192.168}i;
    return 1 if $url =~ m{file://}i;
    return 1 if $url =~ m{\.\.}i;

    return 1 if $url =~ m{https?://www.irccloud.com/pastebin}i;
    return 1 if $url =~ m{http://smuj.ca/cl}i;
    return 1 if $url =~ m{/man\d+/}i;
    return 1 if $url =~ m{godbolt.org}i;
    return 1 if $url =~ m{man\.cgi}i;
    return 1 if $url =~ m{wandbox}i;
    return 1 if $url =~ m{ebay.com/itm}i;
    return 1 if $url =~ m/prntscr.com/i;
    return 1 if $url =~ m/imgbin.org/i;
    return 1 if $url =~ m/jsfiddle.net/i;
    return 1 if $url =~ m/port70.net/i;
    return 1 if $url =~ m/notabug.org/i;
    return 1 if $url =~ m/flickr.com/i;
    return 1 if $url =~ m{www.open-std.org/jtc1/sc22/wg14/www/docs/dr}i;
    return 1 if $url =~ m/cheezburger/i;
    return 1 if $url =~ m/rafb.me/i;
    return 1 if $url =~ m/rextester.com/i;
    return 1 if $url =~ m/explosm.net/i;
    return 1 if $url =~ m/stackoverflow.com/i;
    return 1 if $url =~ m/scratch.mit.edu/i;
    return 1 if $url =~ m/c-faq.com/i;
    return 1 if $url =~ m/imgur.com/i;
    return 1 if $url =~ m/sprunge.us/i;
    return 1 if $url =~ m/pastebin.ws/i;
    return 1 if $url =~ m/hastebin.com/i;
    return 1 if $url =~ m/lmgtfy.com/i;
    return 1 if $url =~ m/gyazo/i;
    return 1 if $url =~ m/imagebin/i;
    return 1 if $url =~ m/\/wiki\//i;
    return 1 if $url =~ m!github.com/.*/tree/.*/source/.*!i;
    return 1 if $url =~ m!github.com/.*/commits/.*!i;
    return 1 if $url =~ m!/blob/!i;
    return 1 if $url =~ m/wiki.osdev.org/i;
    return 1 if $url =~ m/wikipedia.org/i;
    return 1 if $url =~ m/fukung.net/i;
    return 1 if $url =~ m/\/paste\//i;
    return 1 if $url =~ m/paste\./i;
    return 1 if $url =~ m/pastie/i;
    return 1 if $url =~ m/ideone.com/i;
    return 1 if $url =~ m/codepad.org/i;
    return 1 if $url =~ m/^http\:\/\/past(e|ing)\./i;
    return 1 if $url =~ m/past(?:e|ing).*\.(?:com|org|net|ch|ca|de|uk|info)/i;

    # not ignored
    return 0;
}
# Decide whether a fetched page title should be suppressed.
#
# Parameters:
#   $title - the title text after entity decoding
#
# Returns 1 when the title matches any of the patterns below, 0 otherwise.
sub is_ignored_title {
    my ($self, $title) = @_;

    return 1 if $title =~ m{^Loading}i;
    return 1 if $title =~ m{streamable}i;
    return 1 if $title =~ m{^IBM Knowledge Center$}i;
    return 1 if $title =~ m{Freenode head of infrastructure}i;
    return 1 if $title =~ m/^Coliru Viewer$/i;
    return 1 if $title =~ m/^Gerrit Code Review$/i;
    return 1 if $title =~ m/^Public Git Hosting -/i;
    return 1 if $title =~ m/git\/blob/i;
    return 1 if $title =~ m/\sdiff\s/i;
    return 1 if $title =~ m/- Google Search$/;
    return 1 if $title =~ m/linux cross reference/i;
    return 1 if $title =~ m/screenshot/i;
    return 1 if $title =~ m/pastebin/i;
    return 1 if $title =~ m/past[ea]/i;

    # titles made up solely of digits, underscores and hyphens
    return 1 if $title =~ m/^[0-9_-]+$/;

    return 1 if $title =~ m/^Index of \S+$/;
    return 1 if $title =~ m/(?:sign up|login)/i;

    # not ignored
    return 0;
}
# Fetch a URL and extract its page title.
#
# show_url_titles() passes this sub to the process manager as the worker;
# results travel back through the $context hash.
#
# Parameters:
#   $context - hash ref; $context->{arguments} holds the URL to fetch
#
# Returns 0 when the fetch fails or the title is missing or rejected.
# On success it sets $context->{result} (the title) and $context->{url}.
sub get_title {
    my ($self, $context) = @_;

    my $url = $context->{arguments};

    my $ua = LWP::UserAgent::Paranoid->new(request_timeout => TIMEOUT);

    # twitter.com URLs are requested with a Googlebot user agent string;
    # everything else with a desktop Firefox one
    my $user_agent;

    if ($url =~ /twitter.com/) {
        $user_agent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +https://www.google.com/bot.html)';
    } else {
        $user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0';
    }

    $ua->agent($user_agent);
    $ua->max_size(MAX_SIZE);

    my $response = $ua->get($url);

    if (not $response->is_success) {
        $self->{pbot}->{logger}->log("Error getting URL [$url]\n");
        return 0;
    }

    my $title;

    if ($response->title) {
        $title = decode('UTF-8', $response->title);
    } else {
        # fall back to scanning the body for a <title> element
        my $text = $response->decoded_content;

        if ($text =~ m/<title>(.*?)<\/title>/msi) {
            $title = $1;
        }
    }

    if (not defined $title or not length $title) {
        $self->{pbot}->{logger}->log("No title for URL [$url]\n");
        return 0;
    }

    $title = decode_entities($title);

    # disregard one-word titles; these aren't usually interesting
    # (and are usually already present in the URL itself)
    return 0 if $title !~ /\s/;

    # truncate long title
    if (length $title > 400) {
        $title = substr($title, 0, 400);
        $title = "$title [...]";
    }

    # fuzzy compare file against title
    my ($file) = $url =~ m/.*\/(.*)$/;
    $file //= '';    # no '/' in the URL: compare against an empty name
    $file =~ s/[_-]+/ /g;

    my $distance = distance(lc $file, lc $title);
    my $length   = (length $file > length $title) ? length $file : length $title;

    # disregard title when the edit distance, normalized by the longer of
    # the two strings, is below 0.75 -- i.e. when the title is more than
    # roughly 25% similar to the file name.
    # NOTE(review): this comment previously read "75%+ similar", which would
    # correspond to a ratio of at most 0.25; confirm which threshold is
    # intended. The code is left as it was.
    return 0 if $distance / $length < 0.75;

    # disregard ignored titles
    return 0 if $self->is_ignored_title($title);

    # send result back to parent
    $context->{result} = $title;
    $context->{url}    = $url;
}
# Handle the JSON-encoded context sent back by the process that ran
# get_title(), and hand the formatted title to the interpreter.
#
# Parameters:
#   $pid - id of the finished process (not used here)
#   $buf - JSON text holding the context hash
#
# Returns nothing when $buf cannot be decoded into a hash, and 0 when there
# is no title or the same title was shown in this channel within the last
# 15 minutes. Otherwise returns whatever handle_result() returns.
sub title_pipe_reader {
    my ($self, $pid, $buf) = @_;

    # retrieve context object from child; decode_json croaks on malformed
    # input instead of returning false, so trap the exception here
    my $context = eval { decode_json($buf) };

    if (ref($context) ne 'HASH') {
        $self->{pbot}->{logger}->log("Failed to decode bad json: [$buf]\n");
        return;
    }

    # context is no longer forked
    delete $context->{pid};

    my $title = delete $context->{result};

    return 0 if not defined $title or not length $title;

    # disregard recent titles (15 min)
    my $data = $self->{history}->get_data($context->{from}, $title);

    if (defined $data) {
        if (time - $data->{timestamp} < 900) {
            return 0;
        }
    }

    # update history
    $data = {
        url       => $context->{url},
        timestamp => time,
        hostmask  => $context->{hostmask},
    };

    $self->{history}->add($context->{from}, $title, $data, 0, 1);

    # set result
    $context->{result} = "Title of $context->{nick}'s link: $title";

    # send result off to bot to be handled
    $context->{checkflood} = 1;
    $self->{pbot}->{interpreter}->handle_result($context);
}
# Event handler for channel messages and actions: find URLs in the message
# and start a title lookup for each one that is not filtered out.
#
# Parameters:
#   $event_type - name of the dispatched event
#   $event      - hash ref; $event->{event} is the IRC event object and
#                 $event->{interpreted} is true when a bot command already
#                 handled the message
#
# Always returns 0.
sub show_url_titles {
    my ($self, $event_type, $event) = @_;

    my ($nick, $user, $host) = (
        $event->{event}->nick,
        $event->{event}->user,
        $event->{event}->host
    );

    my ($channel, $msg) = (
        $event->{event}->{to}[0],
        $event->{event}->{args}[0]
    );

    my $registry = $self->{pbot}->{registry};

    # An explicit per-channel show_url_titles setting wins; without one the
    # global setting applies, and titles are enabled when neither is set.
    # (Previously an unset channel defaulted straight to enabled, so the
    # global setting could never disable titles.)
    my $enabled = $registry->get_value($channel, 'show_url_titles')
        // $registry->get_value('general', 'show_url_titles')
        // 1;

    # disabled in channel, or globally without a channel override
    return 0 if !$enabled;

    return 0 if $registry->get_value($channel, 'no_url_titles');

    # message already handled by bot command
    return 0 if $event->{interpreted};

    # no url in message (case-insensitive, like the extraction below)
    return 0 if not $msg =~ m/https?:\/\/[^\s]/i;

    # ignored user
    return 0 if $self->{pbot}->{ignorelist}->is_ignored($channel, "$nick!$user\@$host");

    # no titles for unidentified users in +z channels
    my $chanmodes = $self->{pbot}->{channels}->get_meta($channel, 'MODE');

    if (defined $chanmodes and $chanmodes =~ m/z/) {
        my $account  = $self->{pbot}->{messagehistory}->{database}->get_message_account($nick, $user, $host);
        my $nickserv = $self->{pbot}->{messagehistory}->{database}->get_current_nickserv_account($account);
        return 0 if not defined $nickserv or not length $nickserv;
    }

    # Pull URLs out of the message one at a time, looking at no more than
    # three; skipped URLs still count towards that limit.
    my $count = 0;

    while ($msg =~ s/(https?:\/\/[^\s]+)//i && ++$count <= 3) {
        my $url = $1;

        # drop one trailing non-word character
        $url =~ s/\W$//;

        $url =~ s,https://mobile.twitter.com,https://twitter.com,i;

        if ($self->{pbot}->{antispam}->is_spam('url', $url)) {
            $self->{pbot}->{logger}->log("Ignoring spam URL $url\n");
            next;
        }

        if ($self->is_ignored_url($url)) {
            $self->{pbot}->{logger}->log("Ignoring URL $url\n");
            next;
        }

        my $context = {
            from               => $channel,
            nick               => $nick,
            user               => $user,
            host               => $host,
            hostmask           => "$nick!$user\@$host",
            command            => "title $nick $url",
            root_channel       => $channel,
            root_keyword       => "title",
            keyword            => "title",
            arguments          => $url,
            suppress_no_output => 1,
        };

        # get_title() does the fetch and title_pipe_reader() receives its
        # result. NOTE(review): the 30 is presumably a timeout in seconds --
        # confirm against execute_process.
        $self->{pbot}->{process_manager}->execute_process(
            $context,
            sub { $self->get_title(@_) },
            30,
            sub { $self->title_pipe_reader(@_) },
        );
    }

    return 0;
}
1;