pbot/modules/get_title.pl

221 lines
6.6 KiB
Perl
Raw Normal View History

2014-04-26 17:22:46 +02:00
#!/usr/bin/perl -w
2021-07-11 00:00:22 +02:00
# SPDX-FileCopyrightText: 2021 Pragmatic Software <pragma78@gmail.com>
# SPDX-License-Identifier: MIT
License project under MPL2 This patch adds the file LICENSE which is the verbatim copy of the Mozilla Public License Version 2.0 as retreived from https://www.mozilla.org/media/MPL/2.0/index.815ca599c9df.txt on 2017-03-05. This patch also places license headers for the MPL2 type A variant of the license header in the following files: PBot/AntiFlood.pm PBot/BanTracker.pm PBot/BlackList.pm PBot/BotAdminCommands.pm PBot/BotAdmins.pm PBot/ChanOpCommands.pm PBot/ChanOps.pm PBot/Channels.pm PBot/Commands.pm PBot/DualIndexHashObject.pm PBot/EventDispatcher.pm PBot/FactoidCommands.pm PBot/FactoidModuleLauncher.pm PBot/Factoids.pm PBot/HashObject.pm PBot/IRCHandlers.pm PBot/IgnoreList.pm PBot/IgnoreListCommands.pm PBot/Interpreter.pm PBot/LagChecker.pm PBot/Logger.pm PBot/MessageHistory.pm PBot/MessageHistory_SQLite.pm PBot/NickList.pm PBot/PBot.pm PBot/Plugins.pm PBot/Plugins/AntiAway.pm PBot/Plugins/AntiKickAutoRejoin.pm PBot/Plugins/AntiRepeat.pm PBot/Plugins/AntiTwitter.pm PBot/Plugins/AutoRejoin.pm PBot/Plugins/Counter.pm PBot/Plugins/Quotegrabs.pm PBot/Plugins/Quotegrabs/Quotegrabs_Hashtable.pm PBot/Plugins/Quotegrabs/Quotegrabs_SQLite.pm PBot/Plugins/UrlTitles.pm PBot/Plugins/_Example.pm PBot/Refresher.pm PBot/Registerable.pm PBot/Registry.pm PBot/RegistryCommands.pm PBot/SQLiteLogger.pm PBot/SQLiteLoggerLayer.pm PBot/SelectHandler.pm PBot/StdinReader.pm PBot/Timer.pm PBot/Utils/ParseDate.pm PBot/VERSION.pm build/update-version.pl modules/acronym.pl modules/ago.pl modules/c11std.pl modules/c2english.pl modules/c2english/CGrammar.pm modules/c2english/c2eng.pl modules/c99std.pl modules/cdecl.pl modules/cfaq.pl modules/cjeopardy/IRCColors.pm modules/cjeopardy/QStatskeeper.pm modules/cjeopardy/Scorekeeper.pm modules/cjeopardy/cjeopardy.pl modules/cjeopardy/cjeopardy_answer.pl modules/cjeopardy/cjeopardy_filter.pl modules/cjeopardy/cjeopardy_hint.pl modules/cjeopardy/cjeopardy_qstats.pl modules/cjeopardy/cjeopardy_scores.pl modules/cjeopardy/cjeopardy_show.pl 
modules/codepad.pl modules/compiler_block.pl modules/compiler_client.pl modules/compiler_vm/Diff.pm modules/compiler_vm/cc modules/compiler_vm/compiler_client.pl modules/compiler_vm/compiler_server.pl modules/compiler_vm/compiler_server_vbox_win32.pl modules/compiler_vm/compiler_server_watchdog.pl modules/compiler_vm/compiler_vm_client.pl modules/compiler_vm/compiler_vm_server.pl modules/compiler_vm/compiler_watchdog.pl modules/compiler_vm/languages/_c_base.pm modules/compiler_vm/languages/_default.pm modules/compiler_vm/languages/bash.pm modules/compiler_vm/languages/bc.pm modules/compiler_vm/languages/bf.pm modules/compiler_vm/languages/c11.pm modules/compiler_vm/languages/c89.pm modules/compiler_vm/languages/c99.pm modules/compiler_vm/languages/clang.pm modules/compiler_vm/languages/clang11.pm modules/compiler_vm/languages/clang89.pm modules/compiler_vm/languages/clang99.pm modules/compiler_vm/languages/clangpp.pm modules/compiler_vm/languages/clisp.pm modules/compiler_vm/languages/cpp.pm modules/compiler_vm/languages/freebasic.pm modules/compiler_vm/languages/go.pm modules/compiler_vm/languages/haskell.pm modules/compiler_vm/languages/java.pm modules/compiler_vm/languages/javascript.pm modules/compiler_vm/languages/ksh.pm modules/compiler_vm/languages/lua.pm modules/compiler_vm/languages/perl.pm modules/compiler_vm/languages/python.pm modules/compiler_vm/languages/python3.pm modules/compiler_vm/languages/qbasic.pm modules/compiler_vm/languages/scheme.pm modules/compiler_vm/languages/server/_c_base.pm modules/compiler_vm/languages/server/_default.pm modules/compiler_vm/languages/server/c11.pm modules/compiler_vm/languages/server/c89.pm modules/compiler_vm/languages/server/c99.pm modules/compiler_vm/languages/server/clang.pm modules/compiler_vm/languages/server/clang11.pm modules/compiler_vm/languages/server/clang89.pm modules/compiler_vm/languages/server/clang99.pm modules/compiler_vm/languages/server/cpp.pm modules/compiler_vm/languages/server/freebasic.pm 
modules/compiler_vm/languages/server/haskell.pm modules/compiler_vm/languages/server/java.pm modules/compiler_vm/languages/server/qbasic.pm modules/compiler_vm/languages/server/tendra.pm modules/compiler_vm/languages/sh.pm modules/compiler_vm/languages/tendra.pm modules/compliment modules/cstd.pl modules/define.pl modules/dice_roll.pl modules/excuse.sh modules/expand_macros.pl modules/fnord.pl modules/funnyish_quote.pl modules/g.pl modules/gdefine.pl modules/gen_cfacts.pl modules/gencstd.pl modules/get_title.pl modules/getcfact.pl modules/google.pl modules/gspy.pl modules/gtop10.pl modules/gtop15.pl modules/headlines.pl modules/horoscope modules/horrorscope modules/ideone.pl modules/insult.pl modules/love_quote.pl modules/man.pl modules/map.pl modules/math.pl modules/prototype.pl modules/qalc.pl modules/random_quote.pl modules/seen.pl modules/urban modules/weather.pl modules/wikipedia.pl pbot.pl pbot.sh It is highly recommended that this list of files is reviewed to ensure that all files are the copyright of the sole maintainer of the repository. If any files with license headers contain the intellectual property of anyone else, it is recommended that a request is made to revise this patch or that the explicit permission of the co-author is gained to allow for the license of the work to be changed. I (Tomasz Kramkowski), the contributor, take no responsibility for any legal action taken against the maintainer of this repository for incorrectly claiming copyright to any work not owned by the maintainer of this repository.
2017-03-05 22:33:31 +01:00
# Quick and dirty by :pragma
2020-07-06 21:08:03 +02:00
# Update: Did I say quick and dirty? I meant lazy and filthy. I should rewrite this completely.
use LWP::UserAgent;
2010-06-27 04:52:38 +02:00
use HTML::Entities;
use Text::Levenshtein qw(fastdistance);
use Time::HiRes qw(gettimeofday);
2020-02-15 23:38:32 +01:00
# Command-line handling: the first argument is the nick, everything after it
# is the URL. At least two arguments are required.
if (@ARGV < 2) {
    print "Usage: title nick URL\n";
    exit;
}

my $nick = shift(@ARGV);

# Remaining arguments are glued back together with an encoded space between
# them, so a URL that arrived split across several arguments becomes one string.
my $arguments = join("%20", @ARGV);
print STDERR "nick: [$nick], args: [$arguments]\n";

# Drop a single trailing non-word character (presumably punctuation that was
# stuck to the end of the link).
$arguments =~ s/\W$//;
# URL blocklist: the script exits silently (no title is printed) when the
# requested URL matches any of the patterns below. Every pattern is applied
# unanchored unless it carries its own anchor, and all are case-insensitive.
my @blocked_urls = (
    qr{https?://matrix\.to}i,
    qr{https?://.*\.c$}i,
    qr{https?://.*\.h$}i,
    qr{https?://ibb.co/}i,
    qr{https?://.*onlinegdb.com}i,
    qr{googlesource.com/}i,
    qr{https://.*swissborg.com}i,
    qr{https://streamable.com}i,
    qr{https://matrix.org}i,
    qr{https://freenode.net/news/spam-shake}i,
    qr{https://twitter.com/ISCdotORG}i,
    qr{https://evestigatorsucks.com}i,
    qr{https://MattSTrout.com}i,
    qr{https://encyclopediadramatica.rs/Freenodegate}i,
    qr{https://bryanostergaard.com}i,
    qr{https://williampitcock.com}i,
    qr{https?://coliru\..*}i,
    qr{https://www.youtube.com/user/l0de/live}i,

    # Local and private targets, plain files, and path traversal.
    # NOTE(review): the "127" pattern matches those digits anywhere in the
    # URL, not only a loopback address -- kept as-is, confirm that is intended.
    qr{localhost}i,
    qr{127}i,
    qr{192.168}i,
    qr{file://}i,
    qr{\.\.}i,

    qr{https?://www.irccloud.com/pastebin}i,
    qr{http://smuj.ca/cl}i,
    qr{/man\d+/}i,
    qr{godbolt.org}i,
    qr{man\.cgi}i,
    qr{wandbox}i,
    qr{ebay.com/itm}i,
    qr/prntscr.com/i,
    qr/imgbin.org/i,
    qr/jsfiddle.net/i,
    qr/port70.net/i,
    qr/notabug.org/i,
    qr/flickr.com/i,
    qr{www.open-std.org/jtc1/sc22/wg14/www/docs/dr}i,
    qr/cheezburger/i,
    qr/rafb.me/i,
    qr/rextester.com/i,
    qr/explosm.net/i,
    qr/stackoverflow.com/i,
    qr/scratch.mit.edu/i,
    qr/c-faq.com/i,
    qr/imgur.com/i,
    qr/sprunge.us/i,
    qr/pastebin.ws/i,
    qr/hastebin.com/i,
    qr/lmgtfy.com/i,
    qr/gyazo/i,
    qr/imagebin/i,
    qr/\/wiki\//i,
    qr!github.com/.*/tree/.*/source/.*!i,
    qr!github.com/.*/commits/.*!i,
    qr!/blob/!i,
    qr/wiki.osdev.org/i,
    qr/wikipedia.org/i,
    qr/everfall.com/i,
    qr/fukung.net/i,
    qr/\/paste\//i,
    qr/paste\./i,
    qr/pastie/i,
    qr/ideone.com/i,
    qr/codepad.org/i,
    qr/^http\:\/\/past(e|ing)\./i,
    qr/paste.*\.(?:com|org|net|ch|ca|de|uk|info)/i,
    qr/pasting.*\.(?:com|org|net|ca|de|uk|info|ch)/i,
);

for my $blocked (@blocked_urls) {
    exit if $arguments =~ $blocked;
}

# URLs starting with "git" are skipped unless they mention a commit or live on github.com.
exit if $arguments =~ m{https?://git}i and $arguments !~ /commit/i and $arguments !~ /github.com/;
#exit if $arguments =~ m/github.com/i and $arguments !~ m/commit/i;
2021-07-11 00:00:22 +02:00
# Fetch the page. On any unsuccessful response the full response object is
# dumped to STDERR and the script dies.
print STDERR "fetching title\n";

my $ua = LWP::UserAgent->new;

# The URL is supplied by a chat user; refuse every scheme other than HTTP(S)
# at the user-agent level as well, independent of any earlier URL filtering.
$ua->protocols_allowed(['http', 'https']);

# Some sites get a crawler user agent and a larger download cap; everything
# else gets a generic browser user agent and a 200 KiB cap.
if ($arguments =~ /youtube|youtu.be|googlevideo|twitter/) {
    $ua->agent("Googlebot");
    $ua->max_size(1200 * 1024);
} else {
    $ua->agent("Mozilla/5.0");
    $ua->max_size(200 * 1024);
}

my $response = $ua->get($arguments);

if (not $response->is_success) {
    #print "Couldn't get link.\n";
    use Data::Dumper;
    print STDERR Dumper $response;
    die "Couldn't get link: $arguments";
}
2019-08-25 19:46:52 +02:00
# Pull the contents of the first <title> element out of the decoded body.
# If there is none, dump the response to STDERR and exit without output.
my $text = $response->decoded_content;
$text = '' unless defined $text;    # decoded_content yields undef when decoding fails

my $t;

# Allow attributes on the opening tag (<title lang="en">, <title data-x="...">);
# \b keeps longer element names that merely start with "title" from matching.
if ($text =~ m/<title\b[^>]*>(.*?)<\/title>/msi) {
    $t = $1;
} else {
    use Data::Dumper;
    print STDERR Dumper $response;
    print STDERR "No title for link.\n";
    exit;
}
2020-02-15 23:38:32 +01:00
# Normalise the raw title: strip markup, map typographic punctuation to ASCII,
# decode HTML entities once, collapse whitespace, and cap the length.

# Left/right double quotation marks and the en dash as raw UTF-8 byte triples.
# The body has already been character-decoded, so the real code points are
# matched too; the byte form is kept as a fallback in case the body was not
# decoded as UTF-8.
my $quote  = chr(226) . chr(128) . chr(156);
my $quote2 = chr(226) . chr(128) . chr(157);
my $dash   = chr(226) . chr(128) . chr(147);

# Remove any tags nested inside the title (opening and closing alike).
$t =~ s/<[^>]+>//g;

$t =~ s/(?:$quote|\x{201C})/"/g;
$t =~ s/(?:$quote2|\x{201D})/"/g;
$t =~ s/(?:$dash|\x{2013})/-/g;

# Entities that should become plain ASCII rather than the character
# decode_entities would produce for them.
$t =~ s/&#8220;/"/g;
$t =~ s/&#8221;/"/g;
$t =~ s/&nbsp;/ /g;
$t =~ s/&laquo;/<</g;
$t =~ s/&raquo;/>>/g;
$t =~ s/&bull;/-/g;

# Everything else (&amp; &lt; &gt; &quot; &#39; ...) is decoded in one pass,
# so an escaped entity such as "&amp;lt;" ends up as the text "&lt;" and is
# not decoded a second time.
$t = decode_entities($t);

$t =~ s/\s+/ /g;
$t =~ s/^\s+//;
$t =~ s/\s+$//;

# Truncate after decoding so the cut can never land in the middle of an entity.
if (length($t) > 300) {
    $t = substr($t, 0, 300);
    $t = "$t [...]";
}

# $nick =~ s/^(.)(.*)/$1|$2/;
# Compare the title with the last path segment of the URL. The edit distance
# is divided by the longer of the two lengths; when that ratio is below 0.75
# the script exits without printing the title.
my ($file) = $arguments =~ m/.*\/(.*)$/;
$file = '' unless defined $file;    # URL contained no '/' at all
$file =~ s/[_-]/ /g;

my $distance = fastdistance(lc $file, lc $t);
my $length   = (length($file) > length($t)) ? length($file) : length($t);

# $length is zero only when both strings are empty; skip the ratio then
# instead of dividing by zero.
if ($length > 0 and $distance / $length < 0.75) { exit; }
2021-07-11 00:00:22 +02:00
# Title filters: exit without output when the cleaned title is a single word
# or matches any of the patterns below.
print STDERR "passed distance, checking title\n";

exit if $t !~ m/\s/;    # exit if title is only one word -- this isn't usually interesting

my @unwanted_titles = (
    qr{christel}i,
    qr{^Loading}i,
    qr{streamable}i,
    qr{freenode}i,
    qr{ico scam}i,
    qr{^IBM Knowledge Center$}i,
    qr{Freenode head of infrastructure}i,
    qr{ISC on Twitter}i,
    qr{spambot.*freenode}i,
    qr{freenode.*spambot}i,
    qr/^Coliru Viewer$/i,
    qr/^Gerrit Code Review$/i,
    qr/^Public Git Hosting -/i,
    qr/git\/blob/i,
    qr/\sdiff\s/i,
    qr/- Google Search$/,
    qr/linux cross reference/i,
    qr/screenshot/i,
    qr/pastebin/i,
    qr/past[ea]/i,
    qr/^[0-9_-]+$/,
    qr/^Index of \S+$/,
    qr/(?:sign up|login)/i,
);

for my $unwanted (@unwanted_titles) {
    exit if $t =~ $unwanted;
}
2013-08-16 19:28:17 +02:00
2021-07-11 00:00:22 +02:00
# Repeat suppression and output. A per-nick state file holds the last title
# printed and the time it was printed; the same title from the same nick within
# 1800 seconds is not printed again.
print STDERR "passed spam filters\n";

# The nick becomes part of a file name: neutralise path separators and NULs.
(my $safe_nick = $nick) =~ s{[/\0]}{_}g;
my $state_file = "last-title-$safe_nick.dat";

# Explicit parentheses so the call parses as a term no matter how the imported
# gettimeofday is declared; scalar context yields seconds as a float.
my $now = scalar(gettimeofday());

if (open my $fh, "<", $state_file) {
    my @data = <$fh>;
    close $fh;

    # An empty or truncated state file is treated as "no previous title".
    if (@data >= 2) {
        chomp @data;
        exit if $t eq $data[0] and $now - $data[1] < 1800;
    }
}

# Failing to record the state must not stop the title from being printed.
if (open my $fh, ">", $state_file) {
    print $fh "$t\n";
    print $fh "$now\n";
    close $fh;
} else {
    print STDERR "Couldn't write $state_file: $!\n";
}

print "Title of $nick\'s link: $t\n" if length $t;