mirror of
				https://github.com/pragma-/pbot.git
				synced 2025-11-04 08:37:24 +01:00 
			
		
		
		
	Refactor UrlTitles plugin
Moved logic from get_title.pl applet to plugin. Removed get_title.pl applet. ProcessManager::execute_process() can now take a reader subref.
This commit is contained in:
		
							parent
							
								
									66856441e4
								
							
						
					
					
						commit
						02cc7fc488
					
				
							
								
								
									
										220
									
								
								applets/get_title.pl
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										220
									
								
								applets/get_title.pl
									
									
									
									
										vendored
									
									
								
							@ -1,220 +0,0 @@
 | 
			
		||||
#!/usr/bin/perl -w
 | 
			
		||||
 | 
			
		||||
# SPDX-FileCopyrightText: 2021 Pragmatic Software <pragma78@gmail.com>
 | 
			
		||||
# SPDX-License-Identifier: MIT
 | 
			
		||||
 | 
			
		||||
# Quick and dirty by :pragma
 | 
			
		||||
 | 
			
		||||
# Update: Did I say quick and dirty? I meant lazy and filthy. I should rewrite this completely.
 | 
			
		||||
 | 
			
		||||
use LWP::UserAgent;
 | 
			
		||||
use HTML::Entities;
 | 
			
		||||
use Text::Levenshtein qw(fastdistance);
 | 
			
		||||
use Time::HiRes qw(gettimeofday);
 | 
			
		||||
 | 
			
		||||
if ($#ARGV <= 0) {
 | 
			
		||||
    print "Usage: title nick URL\n";
 | 
			
		||||
    exit;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
my $nick      = shift(@ARGV);
 | 
			
		||||
my $arguments = join("%20", @ARGV);
 | 
			
		||||
 | 
			
		||||
print STDERR "nick: [$nick], args: [$arguments]\n";
 | 
			
		||||
 | 
			
		||||
$arguments =~ s/\W$//;
 | 
			
		||||
 | 
			
		||||
exit if $arguments =~ m{https?://matrix\.to}i;
 | 
			
		||||
exit if $arguments =~ m{https?://.*\.c$}i;
 | 
			
		||||
exit if $arguments =~ m{https?://.*\.h$}i;
 | 
			
		||||
exit if $arguments =~ m{https?://ibb.co/}i;
 | 
			
		||||
exit if $arguments =~ m{https?://.*onlinegdb.com}i;
 | 
			
		||||
exit if $arguments =~ m{googlesource.com/}i;
 | 
			
		||||
exit if $arguments =~ m{https?://git}i and $arguments !~ /commit/i and $arguments !~ /github.com/;
 | 
			
		||||
exit if $arguments =~ m{https://.*swissborg.com}i;
 | 
			
		||||
exit if $arguments =~ m{https://streamable.com}i;
 | 
			
		||||
exit if $arguments =~ m{https://matrix.org}i;
 | 
			
		||||
exit if $arguments =~ m{https://freenode.net/news/spam-shake}i;
 | 
			
		||||
exit if $arguments =~ m{https://twitter.com/ISCdotORG}i;
 | 
			
		||||
exit if $arguments =~ m{https://evestigatorsucks.com}i;
 | 
			
		||||
exit if $arguments =~ m{https://MattSTrout.com}i;
 | 
			
		||||
exit if $arguments =~ m{https://encyclopediadramatica.rs/Freenodegate}i;
 | 
			
		||||
exit if $arguments =~ m{https://bryanostergaard.com}i;
 | 
			
		||||
exit if $arguments =~ m{https://williampitcock.com}i;
 | 
			
		||||
exit if $arguments =~ m{https?://coliru\..*}i;
 | 
			
		||||
exit if $arguments =~ m{https://www.youtube.com/user/l0de/live}i;
 | 
			
		||||
exit if $arguments =~ m{localhost}i;
 | 
			
		||||
exit if $arguments =~ m{127}i;
 | 
			
		||||
exit if $arguments =~ m{192.168}i;
 | 
			
		||||
exit if $arguments =~ m{file://}i;
 | 
			
		||||
exit if $arguments =~ m{\.\.}i;
 | 
			
		||||
exit if $arguments =~ m{https?://www.irccloud.com/pastebin}i;
 | 
			
		||||
exit if $arguments =~ m{http://smuj.ca/cl}i;
 | 
			
		||||
exit if $arguments =~ m{/man\d+/}i;
 | 
			
		||||
exit if $arguments =~ m{godbolt.org}i;
 | 
			
		||||
exit if $arguments =~ m{man\.cgi}i;
 | 
			
		||||
exit if $arguments =~ m{wandbox}i;
 | 
			
		||||
exit if $arguments =~ m{ebay.com/itm}i;
 | 
			
		||||
exit if $arguments =~ m/prntscr.com/i;
 | 
			
		||||
exit if $arguments =~ m/imgbin.org/i;
 | 
			
		||||
exit if $arguments =~ m/jsfiddle.net/i;
 | 
			
		||||
exit if $arguments =~ m/port70.net/i;
 | 
			
		||||
exit if $arguments =~ m/notabug.org/i;
 | 
			
		||||
exit if $arguments =~ m/flickr.com/i;
 | 
			
		||||
exit if $arguments =~ m{www.open-std.org/jtc1/sc22/wg14/www/docs/dr}i;
 | 
			
		||||
exit if $arguments =~ m/cheezburger/i;
 | 
			
		||||
exit if $arguments =~ m/rafb.me/i;
 | 
			
		||||
exit if $arguments =~ m/rextester.com/i;
 | 
			
		||||
exit if $arguments =~ m/explosm.net/i;
 | 
			
		||||
exit if $arguments =~ m/stackoverflow.com/i;
 | 
			
		||||
exit if $arguments =~ m/scratch.mit.edu/i;
 | 
			
		||||
exit if $arguments =~ m/c-faq.com/i;
 | 
			
		||||
exit if $arguments =~ m/imgur.com/i;
 | 
			
		||||
exit if $arguments =~ m/sprunge.us/i;
 | 
			
		||||
exit if $arguments =~ m/pastebin.ws/i;
 | 
			
		||||
exit if $arguments =~ m/hastebin.com/i;
 | 
			
		||||
exit if $arguments =~ m/lmgtfy.com/i;
 | 
			
		||||
exit if $arguments =~ m/gyazo/i;
 | 
			
		||||
exit if $arguments =~ m/imagebin/i;
 | 
			
		||||
exit if $arguments =~ m/\/wiki\//i;
 | 
			
		||||
exit if $arguments =~ m!github.com/.*/tree/.*/source/.*!i;
 | 
			
		||||
exit if $arguments =~ m!github.com/.*/commits/.*!i;
 | 
			
		||||
#exit if $arguments =~ m/github.com/i and $arguments !~ m/commit/i;
 | 
			
		||||
exit if $arguments =~ m!/blob/!i;
 | 
			
		||||
exit if $arguments =~ m/wiki.osdev.org/i;
 | 
			
		||||
exit if $arguments =~ m/wikipedia.org/i;
 | 
			
		||||
exit if $arguments =~ m/everfall.com/i;
 | 
			
		||||
exit if $arguments =~ m/fukung.net/i;
 | 
			
		||||
exit if $arguments =~ m/\/paste\//i;
 | 
			
		||||
exit if $arguments =~ m/paste\./i;
 | 
			
		||||
exit if $arguments =~ m/pastie/i;
 | 
			
		||||
exit if $arguments =~ m/ideone.com/i;
 | 
			
		||||
exit if $arguments =~ m/codepad.org/i;
 | 
			
		||||
exit if $arguments =~ m/^http\:\/\/past(e|ing)\./i;
 | 
			
		||||
exit if $arguments =~ m/paste.*\.(?:com|org|net|ch|ca|de|uk|info)/i;
 | 
			
		||||
exit if $arguments =~ m/pasting.*\.(?:com|org|net|ca|de|uk|info|ch)/i;
 | 
			
		||||
 | 
			
		||||
print STDERR "fetching title\n";
 | 
			
		||||
 | 
			
		||||
my $ua = LWP::UserAgent->new;
 | 
			
		||||
if ($arguments =~ /youtube|youtu.be|googlevideo|twitter/) {
 | 
			
		||||
    $ua->agent("Googlebot");
 | 
			
		||||
    $ua->max_size(1200 * 1024);
 | 
			
		||||
} else {
 | 
			
		||||
    $ua->agent("Mozilla/5.0");
 | 
			
		||||
    $ua->max_size(200 * 1024);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
my $response = $ua->get("$arguments");
 | 
			
		||||
 | 
			
		||||
if (not $response->is_success) {
 | 
			
		||||
 | 
			
		||||
    #print "Couldn't get link.\n";
 | 
			
		||||
    use Data::Dumper;
 | 
			
		||||
    print STDERR Dumper $response;
 | 
			
		||||
    die "Couldn't get link: $arguments";
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
my $text = $response->decoded_content;
 | 
			
		||||
 | 
			
		||||
if ($text =~ m/<title>(.*?)<\/title>/msi) { $t = $1; }
 | 
			
		||||
else {
 | 
			
		||||
    use Data::Dumper;
 | 
			
		||||
    print STDERR Dumper $response;
 | 
			
		||||
    print STDERR "No title for link.\n";
 | 
			
		||||
    exit;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
my $quote  = chr(226) . chr(128) . chr(156);
 | 
			
		||||
my $quote2 = chr(226) . chr(128) . chr(157);
 | 
			
		||||
my $dash   = chr(226) . chr(128) . chr(147);
 | 
			
		||||
 | 
			
		||||
$t =~ s/\s+/ /g;
 | 
			
		||||
$t =~ s/^\s+//g;
 | 
			
		||||
$t =~ s/\s+$//g;
 | 
			
		||||
$t =~ s/<[^>]+>//g;
 | 
			
		||||
$t =~ s/<\/[^>]+>//g;
 | 
			
		||||
$t =~ s/$quote/"/g;
 | 
			
		||||
$t =~ s/$quote2/"/g;
 | 
			
		||||
$t =~ s/$dash/-/g;
 | 
			
		||||
$t =~ s/"/"/g;
 | 
			
		||||
$t =~ s/“/"/g;
 | 
			
		||||
$t =~ s/”/"/g;
 | 
			
		||||
$t =~ s/&/&/g;
 | 
			
		||||
$t =~ s/&nsb;/ /g;
 | 
			
		||||
$t =~ s/'/'/g;
 | 
			
		||||
$t =~ s/</</g;
 | 
			
		||||
$t =~ s/>/>/g;
 | 
			
		||||
$t =~ s/«/<</g;
 | 
			
		||||
$t =~ s/»/>>/g;
 | 
			
		||||
$t =~ s/>/>/g;
 | 
			
		||||
$t =~ s/•/-/g;
 | 
			
		||||
$t =~ s/<em>//g;
 | 
			
		||||
$t =~ s/<\/em>//g;
 | 
			
		||||
 | 
			
		||||
if (length $t > 300) {
 | 
			
		||||
    $t = substr($t, 0, 300);
 | 
			
		||||
    $t = "$t [...]";
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# $nick =~ s/^(.)(.*)/$1|$2/;
 | 
			
		||||
 | 
			
		||||
$t = decode_entities($t);
 | 
			
		||||
 | 
			
		||||
$t =~ s/^\s+//;
 | 
			
		||||
$t =~ s/\s+$//;
 | 
			
		||||
 | 
			
		||||
my ($file) = $arguments =~ m/.*\/(.*)$/;
 | 
			
		||||
$file =~ s/[_-]/ /g;
 | 
			
		||||
 | 
			
		||||
my $distance = fastdistance(lc $file, lc $t);
 | 
			
		||||
my $length   = (length $file > length $t) ? length $file : length $t;
 | 
			
		||||
 | 
			
		||||
if ($distance / $length < 0.75) { exit; }
 | 
			
		||||
 | 
			
		||||
print STDERR "passed distance, checking title\n";
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
exit if $t !~ m/\s/;                                 # exit if title is only one word -- this isn't usually interesting
 | 
			
		||||
exit if $t =~ m{christel}i;
 | 
			
		||||
exit if $t =~ m{^Loading}i;
 | 
			
		||||
exit if $t =~ m{streamable}i;
 | 
			
		||||
exit if $t =~ m{freenode}i;
 | 
			
		||||
exit if $t =~ m{ico scam}i;
 | 
			
		||||
exit if $t =~ m{^IBM Knowledge Center$}i;
 | 
			
		||||
exit if $t =~ m{Freenode head of infrastructure}i;
 | 
			
		||||
exit if $t =~ m{ISC on Twitter}i;
 | 
			
		||||
exit if $t =~ m{spambot.*freenode}i;
 | 
			
		||||
exit if $t =~ m{freenode.*spambot}i;
 | 
			
		||||
exit if $t =~ m{christel};
 | 
			
		||||
exit if $t =~ m/^Coliru Viewer$/i;
 | 
			
		||||
exit if $t =~ m/^Gerrit Code Review$/i;
 | 
			
		||||
exit if $t =~ m/^Public Git Hosting -/i;
 | 
			
		||||
exit if $t =~ m/git\/blob/i;
 | 
			
		||||
exit if $t =~ m/\sdiff\s/i;
 | 
			
		||||
exit if $t =~ m/- Google Search$/;
 | 
			
		||||
exit if $t =~ m/linux cross reference/i;
 | 
			
		||||
exit if $t =~ m/screenshot/i;
 | 
			
		||||
exit if $t =~ m/pastebin/i;
 | 
			
		||||
exit if $t =~ m/past[ea]/i;
 | 
			
		||||
exit if $t =~ m/^[0-9_-]+$/;
 | 
			
		||||
exit if $t =~ m/^Index of \S+$/;
 | 
			
		||||
exit if $t =~ m/(?:sign up|login)/i;
 | 
			
		||||
 | 
			
		||||
print STDERR "passed spam filters\n";
 | 
			
		||||
 | 
			
		||||
my @data;
 | 
			
		||||
if (open my $fh, "<", "last-title-$nick.dat") {
 | 
			
		||||
    @data = <$fh>;
 | 
			
		||||
    close $fh;
 | 
			
		||||
 | 
			
		||||
    chomp $data[0];
 | 
			
		||||
    exit if $t eq $data[0] and scalar gettimeofday - $data[1] < 1800;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
open my $fh, ">", "last-title-$nick.dat";
 | 
			
		||||
print $fh "$t\n";
 | 
			
		||||
print $fh scalar gettimeofday, "\n";
 | 
			
		||||
close $fh;
 | 
			
		||||
 | 
			
		||||
print "Title of $nick\'s link: $t\n" if length $t;
 | 
			
		||||
@ -109,7 +109,7 @@ sub launch_applet {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (length $stderr) {
 | 
			
		||||
        if (open(my $fh, '>>', "$applet-stderr")) {
 | 
			
		||||
        if (open(my $fh, '>>:encoding(UTF-8)', "$applet-stderr")) {
 | 
			
		||||
            print $fh $stderr;
 | 
			
		||||
            close $fh;
 | 
			
		||||
        } else {
 | 
			
		||||
 | 
			
		||||
@ -55,7 +55,7 @@ sub remove_process {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub execute_process {
 | 
			
		||||
    my ($self, $context, $subref, $timeout) = @_;
 | 
			
		||||
    my ($self, $context, $subref, $timeout, $reader_subref) = @_;
 | 
			
		||||
 | 
			
		||||
    $timeout //= 30; # default timeout 30 seconds
 | 
			
		||||
 | 
			
		||||
@ -141,7 +141,11 @@ sub execute_process {
 | 
			
		||||
        $self->add_process($context->{pid}, $context);
 | 
			
		||||
 | 
			
		||||
        # add reader handler
 | 
			
		||||
        $self->{pbot}->{select_handler}->add_reader($reader, sub { $self->process_pipe_reader($context->{pid}, @_) });
 | 
			
		||||
        if (defined $reader_subref) {
 | 
			
		||||
            $self->{pbot}->{select_handler}->add_reader($reader, sub { $reader_subref->($context->{pid}, @_) });
 | 
			
		||||
        } else {
 | 
			
		||||
            $self->{pbot}->{select_handler}->add_reader($reader, sub { $self->process_pipe_reader($context->{pid}, @_) });
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        # return empty string since reader will handle the output when child is finished
 | 
			
		||||
        return '';
 | 
			
		||||
 | 
			
		||||
@ -111,7 +111,7 @@ sub save {
 | 
			
		||||
        $self->{pbot}->{logger}->log("Saving $self->{name} to $filename\n");
 | 
			
		||||
 | 
			
		||||
        if (not $self->get_data('$metadata$', '$metadata$', 'update_version')) {
 | 
			
		||||
            $self->add('$metadata$', '$metadata$', { update_version => PBot::VERSION::BUILD_REVISION });
 | 
			
		||||
            $self->add('$metadata$', '$metadata$', { update_version => PBot::VERSION::BUILD_REVISION }, 1, 1);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        $self->set('$metadata$', '$metadata$', 'name', $self->{name}, 1);
 | 
			
		||||
 | 
			
		||||
@ -2,7 +2,7 @@
 | 
			
		||||
#
 | 
			
		||||
# Purpose: Display titles of URLs in channel messages.
 | 
			
		||||
 | 
			
		||||
# SPDX-FileCopyrightText: 2021 Pragmatic Software <pragma78@gmail.com>
 | 
			
		||||
# SPDX-FileCopyrightText: 2021, 2022 Pragmatic Software <pragma78@gmail.com>
 | 
			
		||||
# SPDX-License-Identifier: MIT
 | 
			
		||||
 | 
			
		||||
package PBot::Plugin::UrlTitles;
 | 
			
		||||
@ -10,12 +10,36 @@ use parent 'PBot::Plugin::Base';
 | 
			
		||||
 | 
			
		||||
use PBot::Imports;
 | 
			
		||||
 | 
			
		||||
use Encode;
 | 
			
		||||
use Text::Levenshtein::XS qw(distance);
 | 
			
		||||
use LWP::UserAgent::Paranoid;
 | 
			
		||||
use HTML::Entities;
 | 
			
		||||
use JSON::XS;
 | 
			
		||||
 | 
			
		||||
use constant {
 | 
			
		||||
    TIMEOUT    => 30,
 | 
			
		||||
    USER_AGENT => 'Mozilla/5.0 (compatible)',
 | 
			
		||||
    MAX_SIZE   => 1024 * 200,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
sub initialize {
 | 
			
		||||
    my ($self, %conf) = @_;
 | 
			
		||||
    $self->{pbot}->{registry}->add_default('text',  'general', 'show_url_titles',                 $conf{show_url_titles}                 // 1);
 | 
			
		||||
    $self->{pbot}->{registry}->add_default('array', 'general', 'show_url_titles_channels',        $conf{show_url_titles_channels}        // '.*');
 | 
			
		||||
    $self->{pbot}->{registry}->add_default('array', 'general', 'show_url_titles_ignore_channels', $conf{show_url_titles_ignore_channels} // 'none');
 | 
			
		||||
 | 
			
		||||
    # remember recent titles so we don't repeat them too often
 | 
			
		||||
    my $filename = $self->{pbot}->{registry}->get_value('general', 'data_dir') . '/url-title.hist';
 | 
			
		||||
 | 
			
		||||
    $self->{history} = PBot::Core::Storage::DualIndexHashObject->new(
 | 
			
		||||
        pbot     => $self->{pbot},
 | 
			
		||||
        name     => 'URL title history',
 | 
			
		||||
        filename => $filename,
 | 
			
		||||
    );
 | 
			
		||||
 | 
			
		||||
    $self->{history}->load;
 | 
			
		||||
 | 
			
		||||
    # can be overridden per-channel
 | 
			
		||||
    $self->{pbot}->{registry}->add_default('text', 'general', 'show_url_titles', $conf{show_url_titles} // 1);
 | 
			
		||||
 | 
			
		||||
    # listen to these handlers
 | 
			
		||||
    $self->{pbot}->{event_dispatcher}->register_handler('irc.public',  sub { $self->show_url_titles(@_) });
 | 
			
		||||
    $self->{pbot}->{event_dispatcher}->register_handler('irc.caction', sub { $self->show_url_titles(@_) });
 | 
			
		||||
}
 | 
			
		||||
@ -26,58 +50,283 @@ sub unload {
 | 
			
		||||
    $self->{pbot}->{event_dispatcher}->remove_handler('irc.caction');
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub show_url_titles {
 | 
			
		||||
    my ($self, $event_type, $event) = @_;
 | 
			
		||||
    my $channel = $event->{event}->{to}[0];
 | 
			
		||||
    my ($nick, $user, $host) = ($event->{event}->nick, $event->{event}->user, $event->{event}->host);
 | 
			
		||||
    my $msg = $event->{event}->{args}[0];
 | 
			
		||||
sub is_ignored_url {
 | 
			
		||||
    my ($self, $url) = @_;
 | 
			
		||||
 | 
			
		||||
    return 0 if not $msg =~ m/https?:\/\/[^\s]/;
 | 
			
		||||
    return 0 if $event->{interpreted};
 | 
			
		||||
	return 1 if $url =~ m{https?://matrix\.to}i;
 | 
			
		||||
	return 1 if $url =~ m{https?://.*\.c$}i;
 | 
			
		||||
	return 1 if $url =~ m{https?://.*\.h$}i;
 | 
			
		||||
	return 1 if $url =~ m{https?://ibb.co/}i;
 | 
			
		||||
	return 1 if $url =~ m{https?://.*onlinegdb.com}i;
 | 
			
		||||
	return 1 if $url =~ m{googlesource.com/}i;
 | 
			
		||||
	return 1 if $url =~ m{https?://git}i and $url !~ /commit/i and $url !~ /github.com/;
 | 
			
		||||
	return 1 if $url =~ m{https://.*swissborg.com}i;
 | 
			
		||||
	return 1 if $url =~ m{https://streamable.com}i;
 | 
			
		||||
	return 1 if $url =~ m{https://matrix.org}i;
 | 
			
		||||
	return 1 if $url =~ m{https?://coliru\..*}i;
 | 
			
		||||
	return 1 if $url =~ m{localhost}i;
 | 
			
		||||
	return 1 if $url =~ m{127}i;
 | 
			
		||||
	return 1 if $url =~ m{192.168}i;
 | 
			
		||||
	return 1 if $url =~ m{file://}i;
 | 
			
		||||
	return 1 if $url =~ m{\.\.}i;
 | 
			
		||||
	return 1 if $url =~ m{https?://www.irccloud.com/pastebin}i;
 | 
			
		||||
	return 1 if $url =~ m{http://smuj.ca/cl}i;
 | 
			
		||||
	return 1 if $url =~ m{/man\d+/}i;
 | 
			
		||||
	return 1 if $url =~ m{godbolt.org}i;
 | 
			
		||||
	return 1 if $url =~ m{man\.cgi}i;
 | 
			
		||||
	return 1 if $url =~ m{wandbox}i;
 | 
			
		||||
	return 1 if $url =~ m{ebay.com/itm}i;
 | 
			
		||||
	return 1 if $url =~ m/prntscr.com/i;
 | 
			
		||||
	return 1 if $url =~ m/imgbin.org/i;
 | 
			
		||||
	return 1 if $url =~ m/jsfiddle.net/i;
 | 
			
		||||
	return 1 if $url =~ m/port70.net/i;
 | 
			
		||||
	return 1 if $url =~ m/notabug.org/i;
 | 
			
		||||
	return 1 if $url =~ m/flickr.com/i;
 | 
			
		||||
	return 1 if $url =~ m{www.open-std.org/jtc1/sc22/wg14/www/docs/dr}i;
 | 
			
		||||
	return 1 if $url =~ m/cheezburger/i;
 | 
			
		||||
	return 1 if $url =~ m/rafb.me/i;
 | 
			
		||||
	return 1 if $url =~ m/rextester.com/i;
 | 
			
		||||
	return 1 if $url =~ m/explosm.net/i;
 | 
			
		||||
	return 1 if $url =~ m/stackoverflow.com/i;
 | 
			
		||||
	return 1 if $url =~ m/scratch.mit.edu/i;
 | 
			
		||||
	return 1 if $url =~ m/c-faq.com/i;
 | 
			
		||||
	return 1 if $url =~ m/imgur.com/i;
 | 
			
		||||
	return 1 if $url =~ m/sprunge.us/i;
 | 
			
		||||
	return 1 if $url =~ m/pastebin.ws/i;
 | 
			
		||||
	return 1 if $url =~ m/hastebin.com/i;
 | 
			
		||||
	return 1 if $url =~ m/lmgtfy.com/i;
 | 
			
		||||
	return 1 if $url =~ m/gyazo/i;
 | 
			
		||||
	return 1 if $url =~ m/imagebin/i;
 | 
			
		||||
	return 1 if $url =~ m/\/wiki\//i;
 | 
			
		||||
	return 1 if $url =~ m!github.com/.*/tree/.*/source/.*!i;
 | 
			
		||||
	return 1 if $url =~ m!github.com/.*/commits/.*!i;
 | 
			
		||||
	return 1 if $url =~ m!/blob/!i;
 | 
			
		||||
	return 1 if $url =~ m/wiki.osdev.org/i;
 | 
			
		||||
	return 1 if $url =~ m/wikipedia.org/i;
 | 
			
		||||
	return 1 if $url =~ m/fukung.net/i;
 | 
			
		||||
	return 1 if $url =~ m/\/paste\//i;
 | 
			
		||||
	return 1 if $url =~ m/paste\./i;
 | 
			
		||||
	return 1 if $url =~ m/pastie/i;
 | 
			
		||||
	return 1 if $url =~ m/ideone.com/i;
 | 
			
		||||
	return 1 if $url =~ m/codepad.org/i;
 | 
			
		||||
	return 1 if $url =~ m/^http\:\/\/past(e|ing)\./i;
 | 
			
		||||
	return 1 if $url =~ m/past(?:e|ing).*\.(?:com|org|net|ch|ca|de|uk|info)/i;
 | 
			
		||||
 | 
			
		||||
    if ($self->{pbot}->{ignorelist}->is_ignored($channel, "$nick!$user\@$host")) {
 | 
			
		||||
    # not ignored
 | 
			
		||||
    return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Decide whether a fetched page title should be suppressed.
#
# $title is tested against a fixed table of patterns (loading placeholders,
# code-hosting and paste-site boilerplate, bare numbers, directory indexes,
# sign-up/login pages, ...). Patterns are tried in order; the first match wins.
#
# Returns 1 if any pattern matches, 0 otherwise.
sub is_ignored_title {
    my ($self, $title) = @_;

    my @ignored_patterns = (
        qr{^Loading}i,
        qr{streamable}i,
        qr{^IBM Knowledge Center$}i,
        qr{Freenode head of infrastructure}i,
        qr/^Coliru Viewer$/i,
        qr/^Gerrit Code Review$/i,
        qr/^Public Git Hosting -/i,
        qr/git\/blob/i,
        qr/\sdiff\s/i,
        qr/- Google Search$/,
        qr/linux cross reference/i,
        qr/screenshot/i,
        qr/pastebin/i,
        qr/past[ea]/i,
        qr/^[0-9_-]+$/,
        qr/^Index of \S+$/,
        qr/(?:sign up|login)/i,
    );

    for my $pattern (@ignored_patterns) {
        return 1 if $title =~ $pattern;
    }

    # nothing matched: title is acceptable
    return 0;
}
 | 
			
		||||
 | 
			
		||||
# Fetch the URL in $context->{arguments} and extract a page title worth
# announcing.
#
# show_url_titles() passes this sub to ProcessManager::execute_process();
# the outcome is handed back to the parent through $context->{result}.
#
# Parameters:
#   $context - hash ref; {arguments} holds the URL to fetch.
#
# Returns 0, leaving $context->{result} unset, when the request fails, no
# title is found, the title is a single word, the title is too similar to the
# last path segment of the URL, or is_ignored_title() rejects it. Otherwise
# stores the cleaned title in $context->{result} and returns it.
sub get_title {
    my ($self, $context) = @_;

    my $url = $context->{arguments};

    my $ua = LWP::UserAgent::Paranoid->new(request_timeout => TIMEOUT);

    $ua->agent(USER_AGENT);
    $ua->max_size(MAX_SIZE);

    my $response = $ua->get($url);

    if (not $response->is_success) {
        $self->{pbot}->{logger}->log("Error getting URL [$url]\n");
        return 0;
    }

    my $title;

    if ($response->title) {
        $title = decode('UTF-8', $response->title);
    } else {
        # decoded_content may return undef when the body cannot be decoded
        my $text = $response->decoded_content // '';

        if ($text =~ m/<title>(.*?)<\/title>/msi) {
            $title = $1;
        }
    }

    if (not defined $title or not length $title) {
        $self->{pbot}->{logger}->log("No title for URL [$url]\n");
        return 0;
    }

    $title = decode_entities($title);

    # The <title> fallback above matches across lines, so the capture can
    # carry newlines and markup indentation. Collapse and trim whitespace so
    # the checks below and the channel output see a single clean line.
    $title =~ s/\s+/ /g;
    $title =~ s/^\s+|\s+$//g;

    # disregard one-word titles; these aren't usually interesting
    # (and are usually already present in the URL itself)
    return 0 if $title !~ /\s/;

    # truncate long title
    if (length $title > 400) {
        $title = substr($title, 0, 400);
        $title = "$title [...]";
    }

    # fuzzy compare file (last path segment of the URL) against title
    my ($file) = $url =~ m/.*\/(.*)$/;
    $file //= '';    # URL without any '/' (should not happen for http(s) URLs)
    $file =~ s/[_-]+/ /g;

    my $distance = distance(lc $file, lc $title);
    my $length   = (length $file > length $title) ? length $file : length $title;

    # $distance / $length is the normalized edit distance (0 = identical,
    # 1 = nothing in common); disregard the title unless it differs from the
    # file name by at least 75%
    return 0 if $distance / $length < 0.75;

    # disregard ignored titles
    return 0 if $self->is_ignored_title($title);

    # send result back to parent
    return $context->{result} = $title;
}
 | 
			
		||||
 | 
			
		||||
# Reader callback for the title-fetching process started by show_url_titles().
#
# Parameters:
#   $pid - process id supplied by ProcessManager::execute_process().
#   $buf - JSON text expected to decode to the context hash, with the title
#          stored by get_title() in {result}.
#
# Returns nothing if $buf cannot be decoded, 0 if there is no title or the
# same title was already shown for this channel within the last 15 minutes.
# Otherwise records the title in the history and passes the context to the
# interpreter's handle_result().
sub title_pipe_reader {
    my ($self, $pid, $buf) = @_;

    # retrieve context object from child; decode_json croaks on malformed
    # input rather than returning false, so trap the exception here
    my $context = eval { decode_json($buf) };

    if (not defined $context or ref $context ne 'HASH') {
        my $raw = $buf // '';
        $self->{pbot}->{logger}->log("Failed to decode bad json: [$raw]\n");
        return;
    }

    # context is no longer forked
    delete $context->{pid};

    my $title = delete $context->{result};

    return 0 if not defined $title or not length $title;

    # disregard recent titles (15 min)
    my $data = $self->{history}->get_data($context->{from}, $title);

    if (defined $data) {
        if (time - $data->{timestamp} < 900) {
            return 0;
        }
    }

    # update history
    $data = {
        timestamp => time,
        hostmask  => $context->{hostmask},
    };

    $self->{history}->add($context->{from}, $title, $data, 0, 1);

    # set result
    $context->{result} = "Title of $context->{nick}'s link: $title";

    # send result off to bot to be handled
    $context->{checkflood} = 1;
    $self->{pbot}->{interpreter}->handle_result($context);
}
 | 
			
		||||
 | 
			
		||||
sub show_url_titles {
 | 
			
		||||
    my ($self, $event_type, $event) = @_;
 | 
			
		||||
 | 
			
		||||
    my ($nick, $user, $host) = (
 | 
			
		||||
        $event->{event}->nick,
 | 
			
		||||
        $event->{event}->user,
 | 
			
		||||
        $event->{event}->host
 | 
			
		||||
    );
 | 
			
		||||
 | 
			
		||||
    my ($channel, $msg) = (
 | 
			
		||||
        $event->{event}->{to}[0],
 | 
			
		||||
        $event->{event}->{args}[0]
 | 
			
		||||
    );
 | 
			
		||||
 | 
			
		||||
    # get show_url_titles for channel or true if not defined
 | 
			
		||||
    my $enabled = $self->{pbot}->{registry}->get_value($channel, 'show_url_titles') // 1;
 | 
			
		||||
 | 
			
		||||
    # disabled in channel
 | 
			
		||||
    return 0 if !$enabled;
 | 
			
		||||
    return 0 if $self->{pbot}->{registry}->get_value($channel, 'no_url_titles');
 | 
			
		||||
 | 
			
		||||
    # disabled globally (unless allowed by channel)
 | 
			
		||||
    return 0 if !$self->{pbot}->{registry}->get_value('general', 'show_url_titles') && !$enabled;
 | 
			
		||||
 | 
			
		||||
    # message already handled by bot command
 | 
			
		||||
    return 0 if $event->{interpreted};
 | 
			
		||||
 | 
			
		||||
    # no url in message
 | 
			
		||||
    return 0 if not $msg =~ m/https?:\/\/[^\s]/;
 | 
			
		||||
 | 
			
		||||
    # ignored user
 | 
			
		||||
    return 0 if $self->{pbot}->{ignorelist}->is_ignored($channel, "$nick!$user\@$host");
 | 
			
		||||
 | 
			
		||||
    # no titles for unidentified users in +z channels
 | 
			
		||||
    my $chanmodes = $self->{pbot}->{channels}->get_meta($channel, 'MODE');
 | 
			
		||||
 | 
			
		||||
    if (defined $chanmodes and $chanmodes =~ m/z/) {
 | 
			
		||||
        my $account  = $self->{pbot}->{messagehistory}->{database}->get_message_account($nick, $user, $host);
 | 
			
		||||
        my $nickserv = $self->{pbot}->{messagehistory}->{database}->get_current_nickserv_account($account);
 | 
			
		||||
        return 0 if not defined $nickserv or not length $nickserv;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (    $self->{pbot}->{registry}->get_value('general', 'show_url_titles')
 | 
			
		||||
        and not $self->{pbot}->{registry}->get_value($channel, 'no_url_titles')
 | 
			
		||||
        and not grep { $channel =~ /$_/i } $self->{pbot}->{registry}->get_value('general', 'show_url_titles_ignore_channels')
 | 
			
		||||
        and grep     { $channel =~ /$_/i } $self->{pbot}->{registry}->get_value('general', 'show_url_titles_channels'))
 | 
			
		||||
    {
 | 
			
		||||
        my $count = 0;
 | 
			
		||||
        while ($msg =~ s/(https?:\/\/[^\s]+)//i && ++$count <= 3) {
 | 
			
		||||
            my $url = $1;
 | 
			
		||||
    my $count = 0;
 | 
			
		||||
 | 
			
		||||
            if ($self->{pbot}->{antispam}->is_spam('url', $url)) {
 | 
			
		||||
                $self->{pbot}->{logger}->log("Ignoring spam URL $url\n");
 | 
			
		||||
                next;
 | 
			
		||||
            }
 | 
			
		||||
    while ($msg =~ s/(https?:\/\/[^\s]+)//i && ++$count <= 3) {
 | 
			
		||||
        my $url = $1;
 | 
			
		||||
 | 
			
		||||
            my $context = {
 | 
			
		||||
                from               => $channel,
 | 
			
		||||
                nick               => $nick,
 | 
			
		||||
                user               => $user,
 | 
			
		||||
                host               => $host,
 | 
			
		||||
                hostmask           => "$nick!$user\@$host",
 | 
			
		||||
                command            => "title $nick $url",
 | 
			
		||||
                root_channel       => $channel,
 | 
			
		||||
                root_keyword       => "title",
 | 
			
		||||
                keyword            => "title",
 | 
			
		||||
                arguments          => "$nick $url",
 | 
			
		||||
                suppress_no_output => 1,
 | 
			
		||||
            };
 | 
			
		||||
        $url =~ s/\W$//;
 | 
			
		||||
		$url =~ s,https://mobile.twitter.com,https://twitter.com,i;
 | 
			
		||||
 | 
			
		||||
            $self->{pbot}->{applets}->execute_applet($context);
 | 
			
		||||
        if ($self->{pbot}->{antispam}->is_spam('url', $url)) {
 | 
			
		||||
            $self->{pbot}->{logger}->log("Ignoring spam URL $url\n");
 | 
			
		||||
            next;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if ($self->is_ignored_url($url)) {
 | 
			
		||||
            $self->{pbot}->{logger}->log("Ignoring URL $url\n");
 | 
			
		||||
            next;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        my $context = {
 | 
			
		||||
            from               => $channel,
 | 
			
		||||
            nick               => $nick,
 | 
			
		||||
            user               => $user,
 | 
			
		||||
            host               => $host,
 | 
			
		||||
            hostmask           => "$nick!$user\@$host",
 | 
			
		||||
            command            => "title $nick $url",
 | 
			
		||||
            root_channel       => $channel,
 | 
			
		||||
            root_keyword       => "title",
 | 
			
		||||
            keyword            => "title",
 | 
			
		||||
            arguments          => $url,
 | 
			
		||||
            suppress_no_output => 1,
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        $self->{pbot}->{process_manager}->execute_process(
 | 
			
		||||
            $context,
 | 
			
		||||
            sub { $self->get_title(@_) },
 | 
			
		||||
            30,
 | 
			
		||||
            sub { $self->title_pipe_reader(@_) },
 | 
			
		||||
        );
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -25,8 +25,8 @@ use PBot::Imports;
 | 
			
		||||
# These are set by the /misc/update_version script
 | 
			
		||||
use constant {
 | 
			
		||||
    BUILD_NAME     => "PBot",
 | 
			
		||||
    BUILD_REVISION => 4503,
 | 
			
		||||
    BUILD_DATE     => "2022-02-24",
 | 
			
		||||
    BUILD_REVISION => 4505,
 | 
			
		||||
    BUILD_DATE     => "2022-02-27",
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
sub initialize {}
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user