2014-04-26 17:22:46 +02:00
|
|
|
#!/usr/bin/perl -w
|
2009-12-09 02:08:12 +01:00
|
|
|
|
2017-03-05 22:33:31 +01:00
|
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
|
2009-12-09 02:08:12 +01:00
|
|
|
# Quick and dirty by :pragma
|
|
|
|
|
2020-07-06 21:08:03 +02:00
|
|
|
# Update: Did I say quick and dirty? I meant lazy and filthy. I should rewrite this completely.
|
|
|
|
|
2009-12-09 02:08:12 +01:00
|
|
|
use LWP::UserAgent;
|
2010-06-27 04:52:38 +02:00
|
|
|
use HTML::Entities;
|
2013-06-03 19:02:58 +02:00
|
|
|
use Text::Levenshtein qw(fastdistance);
|
2015-09-08 10:25:12 +02:00
|
|
|
use Time::HiRes qw(gettimeofday);
|
2009-12-09 02:08:12 +01:00
|
|
|
|
2020-02-15 23:38:32 +01:00
|
|
|
# At least two arguments are required: the nick that posted the link,
# followed by the URL (possibly split over several words).
if (@ARGV < 2) {
    print "Usage: title nick URL\n";
    exit;
}

# The first argument is the nick; whatever remains is glued back together
# with %20 so that it forms one single URL string.
my $nick      = shift @ARGV;
my $arguments = join '%20', @ARGV;

printf STDERR "nick: [%s], args: [%s]\n", $nick, $arguments;

# Remove one trailing non-word character from the URL, if there is one.
$arguments =~ s/\W$//;
|
2009-12-09 02:08:12 +01:00
|
|
|
|
2021-02-07 23:37:12 +01:00
|
|
|
# URL patterns for which no title is ever announced.  The script exits
# silently as soon as the requested URL matches any one of them.  All
# patterns are unanchored unless they say otherwise, and all are matched
# case-insensitively.
my @ignored_url_patterns = (
    # Specific schemes, hosts and pages.
    qr{https?://.*\.c$}i,
    qr{https?://.*\.h$}i,
    qr{https?://ibb.co/}i,
    qr{https?://.*onlinegdb.com}i,
    qr{googlesource.com/}i,
    qr{https?://git}i,
    qr{https://.*swissborg.com}i,
    qr{https://streamable.com}i,
    qr{https://matrix.org}i,
    qr{https://freenode.net/news/spam-shake}i,
    qr{https://twitter.com/ISCdotORG}i,
    qr{https://evestigatorsucks.com}i,
    qr{https://MattSTrout.com}i,
    qr{https://encyclopediadramatica.rs/Freenodegate}i,
    qr{https://bryanostergaard.com}i,
    qr{https://williampitcock.com}i,
    qr{https?://coliru\..*}i,
    qr{https://www.youtube.com/user/l0de/live}i,

    # Anything that mentions a local address, a file URL or a ".." sequence.
    qr{localhost}i,
    qr{127}i,
    qr{192.168}i,
    qr{file://}i,
    qr{\.\.}i,

    qr{https?://www.irccloud.com/pastebin}i,
    qr{http://smuj.ca/cl}i,
    qr{/man\d+/}i,
    qr{godbolt.org}i,
    qr{man\.cgi}i,
    qr{wandbox}i,
    qr{ebay.com/itm}i,
    qr/prntscr.com/i,
    qr/imgbin.org/i,
    qr/jsfiddle.net/i,
    qr/port70.net/i,
    qr/notabug.org/i,
    qr/flickr.com/i,
    qr{www.open-std.org/jtc1/sc22/wg14/www/docs/dr}i,
    qr/cheezburger/i,
    qr/rafb.me/i,
    qr/rextester.com/i,
    qr/explosm.net/i,
    qr/stackoverflow.com/i,
    qr/scratch.mit.edu/i,
    qr/c-faq.com/i,
    qr/imgur.com/i,
    qr/sprunge.us/i,
    qr/pastebin.ws/i,
    qr/hastebin.com/i,
    qr/lmgtfy.com/i,
    qr/gyazo/i,
    qr/imagebin/i,
    qr/\/wiki\//i,
    qr/github.com/i,
    qr/wiki.osdev.org/i,
    qr/wikipedia.org/i,
    qr/everfall.com/i,
    qr/fukung.net/i,

    # Paste sites, by path fragment or by host name shape.
    qr/\/paste\//i,
    qr/paste\./i,
    qr/pastie/i,
    qr/ideone.com/i,
    qr/codepad.org/i,
    qr/^http\:\/\/past(e|ing)\./i,
    qr/paste.*\.(?:com|org|net|ch|ca|de|uk|info)/i,
    qr/pasting.*\.(?:com|org|net|ca|de|uk|info|ch)/i,
);

foreach my $ignored (@ignored_url_patterns) {
    exit if $arguments =~ $ignored;
}
|
2009-12-09 02:08:12 +01:00
|
|
|
|
|
|
|
# Fetch the page behind the URL.
my $ua = LWP::UserAgent->new;

# The URL comes from chat and is therefore untrusted.  Restrict the client
# to HTTP(S) so that other schemes LWP understands (for example "file:/path",
# which slips past a "file://" text check) are refused by the user agent
# itself instead of being fetched.
$ua->protocols_allowed([ 'http', 'https' ]);

# YouTube-like hosts get a different User-Agent string and a larger
# download cap; everything else is limited to the first 200 KiB.
if ($arguments =~ /youtube|youtu.be|googlevideo/) {
    $ua->agent("Googlebot");
    $ua->max_size(1200 * 1024);
}
else {
    $ua->agent("Mozilla/5.0");
    $ua->max_size(200 * 1024);
}

my $response = $ua->get($arguments);

if (not $response->is_success) {
    # Nothing is printed on STDOUT for a failed fetch; the full response is
    # dumped to STDERR for debugging and the script dies with the status.
    use Data::Dumper;
    print STDERR Dumper $response;
    die "Couldn't get link: $arguments (" . $response->status_line . ")";
}
|
|
|
|
|
2019-08-25 19:46:52 +02:00
|
|
|
# Pull the contents of the first <title> element out of the response body.
my $text = $response->decoded_content;

# decoded_content can return undef when the body cannot be decoded; treat
# that like a page without a title instead of matching against undef.
$text = '' unless defined $text;

# $t holds the title for the remainder of the script.
my $t;

# Accept attributes on the tag (e.g. <title data-rh="true">); the \b keeps
# other element names that merely start with "title" from matching.
if ($text =~ m{<title\b[^>]*>(.*?)</title>}msi) {
    $t = $1;
}
else {
    # No title: dump the response to STDERR for debugging and stay silent
    # on STDOUT.
    use Data::Dumper;
    print STDERR Dumper $response;
    exit;
}
|
|
|
|
|
2020-02-15 23:38:32 +01:00
|
|
|
# Normalise the raw title: strip markup, decode entities, replace a few
# typographic characters with ASCII, tidy whitespace and cap the length.

# Remove any tags found inside the title.  This is done before entity
# decoding so that escaped text such as "&lt;vector&gt;" survives as the
# literal text "<vector>".
$t =~ s/<[^>]+>//g;

# Decode all HTML entities in one step.  Doing this before truncation means
# the length limit below can never cut an entity in half.
$t = decode_entities($t);

# Typographic characters and their ASCII stand-ins.  Each one is replaced
# both as a real character and as its UTF-8 byte sequence; the latter is the
# form the original chr(226).chr(128)... constants matched.
# NOTE(review): the byte form presumably shows up when a UTF-8 page was
# decoded as Latin-1 -- confirm against real pages.
my @ascii_replacements = (
    [ "\x{201C}", '"'  ],    # left double quotation mark
    [ "\x{201D}", '"'  ],    # right double quotation mark
    [ "\x{2013}", '-'  ],    # en dash
    [ "\x{2022}", '-'  ],    # bullet
    [ "\x{00AB}", '<<' ],    # left-pointing double angle quotation mark
    [ "\x{00BB}", '>>' ],    # right-pointing double angle quotation mark
);

for my $replacement (@ascii_replacements) {
    my ($char, $ascii) = @$replacement;

    my $utf8_bytes = $char;
    utf8::encode($utf8_bytes);

    # Byte sequence first: for U+00AB/U+00BB it contains the character
    # itself, so the order matters.
    $t =~ s/\Q$utf8_bytes\E/$ascii/g;
    $t =~ s/\Q$char\E/$ascii/g;
}

# Non-breaking spaces (what &nbsp; decodes to) become ordinary spaces.
$t =~ tr/\xA0/ /;

# Collapse runs of whitespace and trim both ends.
$t =~ s/\s+/ /g;
$t =~ s/^\s+//;
$t =~ s/\s+$//;

# Cap the title at 150 characters and mark the cut.
if (length($t) > 150) {
    $t = substr($t, 0, 150) . " [...]";
}

# $nick =~ s/^(.)(.*)/$1|$2/;
|
|
|
|
|
2013-06-03 19:02:58 +02:00
|
|
|
# Skip titles that merely repeat what the URL already says: compare the last
# path component of the URL (with "_" and "-" read as spaces) to the title
# using Levenshtein distance.
my ($file) = $arguments =~ m/.*\/(.*)$/;

# No "/" in the URL means no capture; compare against an empty name instead
# of operating on undef.
$file = '' unless defined $file;
$file =~ s/[_-]/ /g;

# Normalise the distance by the longer of the two strings.
my $length = (length($file) > length($t)) ? length($file) : length($t);

# Both strings empty: there is nothing to announce, and dividing by zero
# below would abort the script.
exit if $length == 0;

my $distance = fastdistance(lc $file, lc $t);

# A ratio below 0.75 counts as "too similar to the URL" -- stay silent.
if ($distance / $length < 0.75) { exit; }
|
2013-06-03 19:02:58 +02:00
|
|
|
|
2020-02-15 23:38:32 +01:00
|
|
|
# exit if title is only one word -- this isn't usually interesting
unless ($t =~ m/\s/) {
    exit;
}

# Titles that are never announced.  The script exits silently as soon as
# the title matches any one of these patterns.
my @ignored_title_patterns = (
    qr{christel}i,
    qr{^Loading}i,
    qr{streamable}i,
    qr{freenode}i,
    qr{ico scam}i,
    qr{^IBM Knowledge Center$}i,
    qr{Freenode head of infrastructure}i,
    qr{ISC on Twitter}i,
    qr{spambot.*freenode}i,
    qr{freenode.*spambot}i,
    qr{christel},
    qr/^Coliru Viewer$/i,
    qr/^Gerrit Code Review$/i,
    qr/^Public Git Hosting -/i,
    qr/git\/blob/i,
    qr/\sdiff\s/i,
    qr/- Google Search$/,
    qr/linux cross reference/i,
    qr/screenshot/i,
    qr/pastebin/i,
    qr/past[ea]/i,
    qr/^[0-9_-]+$/,
    qr/^Index of \S+$/,
    qr/(?:sign up|login)/i,
);

foreach my $ignored_title (@ignored_title_patterns) {
    exit if $t =~ $ignored_title;
}
|
2013-08-16 19:28:17 +02:00
|
|
|
|
2015-09-08 10:25:12 +02:00
|
|
|
# Suppress repeats: remember the last title announced for this nick in a
# small state file and stay silent when the same title comes up again
# within 1800 seconds.

# The nick comes from the command line and ends up in a file name, so
# replace every character outside letters, digits, "_" and []\`^{}|-
# to keep path separators out of it.
(my $safe_nick = $nick) =~ s/[^\w\[\]\\`^{}|-]/_/g;
my $state_file = "last-title-$safe_nick.dat";

# Explicit call in scalar context: floating-point seconds since the epoch.
my $now = gettimeofday();

my @data;
if (open my $in, '<:encoding(UTF-8)', $state_file) {
    @data = <$in>;
    close $in;
    chomp @data;

    # Line 1 is the previous title, line 2 the time it was announced.  An
    # empty or truncated file simply means there is nothing to compare.
    my ($last_title, $last_time) = @data;
    exit if defined $last_title
        and defined $last_time
        and $t eq $last_title
        and $now - $last_time < 1800;
}

# Record this title.  The file is written with the same encoding layer it is
# read with, so non-ASCII titles compare equal next time.  A failure to
# write only costs duplicate suppression, so warn and carry on.
if (open my $out, '>:encoding(UTF-8)', $state_file) {
    print {$out} "$t\n", $now, "\n";
    close $out or warn "Couldn't close $state_file: $!\n";
}
else {
    warn "Couldn't write $state_file: $!\n";
}

print "Title of $nick\'s link: $t\n" if length $t;
|