3
0
mirror of https://github.com/pragma-/pbot.git synced 2025-01-10 20:12:35 +01:00
pbot/applets/wiktionary.pl

315 lines
7.6 KiB
Perl
Raw Normal View History

2021-07-10 11:33:39 +02:00
#!/usr/bin/env perl
# File: wiktionary.pl
#
# Purpose: Queries Wiktionary website.
#
# This is a rough first draft. There are more parts of the definitions
# to process.
2021-07-10 11:33:39 +02:00
2021-07-11 00:00:22 +02:00
# SPDX-FileCopyrightText: 2021 Pragmatic Software <pragma78@gmail.com>
# SPDX-License-Identifier: MIT
2021-07-10 11:33:39 +02:00
use warnings;
use strict;
use Cache::FileCache;
use Encode;
2021-07-10 11:33:39 +02:00
use Getopt::Long qw/GetOptionsFromArray/;
use JSON;
2022-06-28 05:35:02 +02:00
sub flatten { map { ref eq 'ARRAY' ? flatten(@$_) : $_ } @_ }
2021-07-10 11:33:39 +02:00
binmode(STDOUT, ":utf8");
@ARGV = map { decode('UTF-8', $_, 1) } @ARGV;
my $usage = "Usage: wiktionary <term> [-pos <part of speech>] [-e] [-p] [-r [relation]] [-l <language>] [-n <entry number>]; -e for etymology; -p for pronunciation\n";
2021-07-10 11:33:39 +02:00
my ($term, $lang, $section, $num, $all, $unique, $opt_e, $opt_p, $part_of_speech, $relation);
2021-07-10 11:33:39 +02:00
{
my $opt_error;
local $SIG{__WARN__} = sub {
$opt_error = shift;
chomp $opt_error;
};
Getopt::Long::Configure("bundling_override");
GetOptionsFromArray(
\@ARGV,
'lang|l=s' => \$lang,
'section|s=s' => \$section,
'num|n=i' => \$num,
'all|a' => \$all,
'unique|u' => \$unique,
'pos=s' => \$part_of_speech,
'r:s' => \$relation,
2021-07-10 11:33:39 +02:00
'p' => \$opt_p,
'e' => \$opt_e,
);
if ($opt_error) {
print "$opt_error: $usage\n";
exit 1;
}
$term = "@ARGV";
if (not length $term) {
print $usage;
exit 1;
}
}
$lang //= 'English';
$section //= 'definitions';
$all //= 1;
2021-07-10 11:33:39 +02:00
if ($opt_p and $opt_e) {
print "Options -e and -p cannot be used together.\n";
exit 1;
}
if ($opt_p) {
$unique = 1 unless defined $num;
$section = 'pronunciations';
}
if ($opt_e) {
$unique = 1 unless defined $num;
$section = 'etymology';
}
$num //= 1;
my $cache = Cache::FileCache->new({ namespace => 'wiktionary', default_expires_in => '1 week' });
2023-03-19 22:50:52 +01:00
my $cache_id = encode('UTF-8', "$term $lang");
2021-07-10 11:33:39 +02:00
my $entries = $cache->get($cache_id);
if (not defined $entries) {
my $json = `python3 wiktionary.py \Q$term\E \Q$lang\E`;
$entries = eval { decode_json $json };
if ($@) {
print "$json\n";
print "$@\n";
die;
}
2021-07-10 11:33:39 +02:00
$cache->set($cache_id, $entries);
}
2022-06-28 05:35:02 +02:00
if ($ENV{DEBUG}) {
use Data::Dumper;
print Dumper($entries), "\n";
}
2024-10-03 05:13:45 +02:00
my @valid_sections = qw/definitions etymology pronunciations/;
2021-07-10 11:33:39 +02:00
if (not grep { $_ eq $section } @valid_sections) {
print "Unknown section `$section`. Available sections are: " . join(', ', sort @valid_sections) . "\n";
exit 1;
}
2022-06-28 05:35:02 +02:00
my $entries_text = $section;
if (ref $entries eq 'HASH') {
$entries_text =~ s/y$/ies/;
print "No $entries_text for `$term`";
if ($entries->{languages}->@*) {
print " in $lang; try ", (join ', ', $entries->{languages}->@*);
} else {
print " found in any languages";
}
if ($entries->{disambig}->@*) {
print "; see also: ", (join ', ', $entries->{disambig}->@*);
}
print "\n";
exit;
}
my $total_entries_count = @$entries;
2022-06-28 05:35:02 +02:00
if ($total_entries_count == 0) {
$entries_text =~ s/y$/ies/;
print "No $entries_text for `$term`.\n";
exit 1;
}
if ($num > $total_entries_count) {
if ($total_entries_count == 1) {
$entries_text =~ s/s$//;
} else {
$entries_text =~ s/y$/ies/;
}
2021-07-10 11:33:39 +02:00
my $are = $total_entries_count == 1 ? 'is' : 'are';
print "No such entry $num. There $are $total_entries_count $entries_text for `$term`.\n";
2021-07-10 11:33:39 +02:00
exit 1;
}
my $start = 0;
if ($num <= 0 or $all or $unique) {
$num = $total_entries_count;
2021-07-10 11:33:39 +02:00
} else {
$start = $num - 1;
}
my @results;
my %parts_of_speech;
my %relationships;
2021-07-10 11:33:39 +02:00
for (my $i = $start; $i < $num; $i++) {
my $entry = $entries->[$i];
if ($section eq 'etymology') {
my $ety = $entry->{etymology};
chomp $ety;
if (not length $ety or $ety =~ /This etymology is missing or incomplete./) {
$ety = 'N/A';
}
push @results, $ety unless $ety eq 'N/A' and not $all;
2021-07-10 11:33:39 +02:00
}
elsif ($section eq 'pronunciations') {
if (exists $entry->{pronunciations}
and exists $entry->{pronunciations}->{text})
{
my $text = join '; ', @{$entry->{pronunciations}->{text}};
$text = 'N/A' if not length $text;
push @results, $text unless $text eq 'N/A' and not $all;
2021-07-10 11:33:39 +02:00
} else {
push @results, 'N/A';
}
}
elsif ($section eq 'definitions') {
my $text;
foreach my $definition (@{$entry->{definitions}}) {
$parts_of_speech{$definition->{partOfSpeech}} = 1;
next if defined $part_of_speech && $definition->{partOfSpeech} ne $part_of_speech;
2024-10-03 05:13:45 +02:00
my $relations = '';
if (defined $relation) {
2024-10-03 05:13:45 +02:00
foreach my $rel ($definition->{relatedWords}->@*) {
$relationships{$rel->{relationshipType}} = 1;
if (!length $relation || $relation eq '*' || $rel->{relationshipType} eq $relation) {
2024-10-03 05:13:45 +02:00
if ($rel->{words}->@*) {
$relations .= "$rel->{relationshipType}) ";
$relations .= join (", ", $rel->{words}->@*);
$relations .= "\n\n";
}
}
}
2024-10-03 05:13:45 +02:00
next if not length $relations;
}
2024-10-03 05:13:45 +02:00
my $entry = -1;
$text .= "$definition->{partOfSpeech}) ";
foreach my $def (flatten @{$definition->{text}}) {
$def =~ s/^#//;
$text .= "$def\n";
if (!defined $relation && @{$definition->{examples}}) {
foreach my $example (@{$definition->{examples}}) {
if ($example->{index} == $entry) {
$text .= " ($example->{text})\n";
}
}
}
2024-10-03 05:13:45 +02:00
$entry++;
$text .= "\n";
}
2024-10-03 05:13:45 +02:00
$text .= $relations;
2021-07-10 11:33:39 +02:00
}
push @results, $text if length $text;
2021-07-10 11:33:39 +02:00
}
}
if (not @results) {
if (defined $part_of_speech) {
2024-10-03 05:13:45 +02:00
if (not exists $parts_of_speech{$part_of_speech}) {
print "There is no $part_of_speech available for `$term`.\n";
my @pos = sort keys %parts_of_speech;
if (@pos) {
print 'Try ', join (', ', @pos), ".\n";
}
exit 1;
}
}
2024-10-03 05:13:45 +02:00
if (defined $relation) {
$relation = 'relations' if not length $relation;
if (not exists $relationships{$relation}) {
print "There are no $relation available for `$term`.\n";
my @relations = sort keys %relationships;
2024-10-03 05:13:45 +02:00
if (@relations) {
print 'Try ', join (', ', @relations), ".\n";
}
2024-10-03 05:13:45 +02:00
exit 1;
}
}
2024-10-03 05:13:45 +02:00
print "There is no $entries_text available for `$term`.\n";
2022-06-28 05:35:02 +02:00
exit 1;
}
2021-07-10 11:33:39 +02:00
if ($unique) {
my %uniq;
my $i = 0;
foreach my $result (@results) {
$i++;
next if not $result or $result eq 'N/A';
$uniq{$result} .= "$i,";
2021-07-10 11:33:39 +02:00
}
if (not keys %uniq) {
print "No $section available for $term.\n";
exit;
}
no warnings; # sorting "1,2,3" numerically
2021-07-10 11:33:39 +02:00
foreach my $key (sort { $uniq{$a} <=> $uniq{$b} } keys %uniq) {
my ($q, $p);
$uniq{$key} =~ s/,$//;
$uniq{$key} =~ s/\b(\d+)(?{$q=$1+1})(?:,(??{$q})\b(?{$p=$q++})){2,}/$1-$p/g;
2021-07-10 11:33:39 +02:00
print "$uniq{$key}) $key\n\n";
}
use warnings;
2021-07-10 11:33:39 +02:00
exit;
}
my $i = @results == 1 ? $num - 1 : 0;
foreach my $result (@results) {
$i++;
next if not $result;
print "$i) $result\n";
2021-07-10 11:33:39 +02:00
}