2021-07-10 11:33:39 +02:00
|
|
|
#!/usr/bin/env perl
|
|
|
|
|
|
|
|
# File: wiktionary.pl
|
|
|
|
#
|
|
|
|
# Purpose: Queries Wiktionary website.
|
|
|
|
#
|
|
|
|
# This is a rough first draft. There are more parts of the definitions
|
2021-07-10 12:03:56 +02:00
|
|
|
# to process.
|
2021-07-10 11:33:39 +02:00
|
|
|
|
2021-07-11 00:00:22 +02:00
|
|
|
# SPDX-FileCopyrightText: 2021 Pragmatic Software <pragma78@gmail.com>
|
|
|
|
# SPDX-License-Identifier: MIT
|
2021-07-10 11:33:39 +02:00
|
|
|
|
|
|
|
use warnings;
|
|
|
|
use strict;
|
|
|
|
|
|
|
|
use Cache::FileCache;
|
2021-07-10 12:03:56 +02:00
|
|
|
use Encode;
|
2021-07-10 11:33:39 +02:00
|
|
|
use Getopt::Long qw/GetOptionsFromArray/;
|
|
|
|
use JSON;
|
|
|
|
|
|
|
|
binmode(STDOUT, ":utf8");
|
|
|
|
|
2021-07-10 12:03:56 +02:00
|
|
|
@ARGV = map { decode('UTF-8', $_, 1) } @ARGV;
|
|
|
|
|
|
|
|
my $usage = "Usage: wiktionary <term> [-e] [-p] [-l <language>] [-n <entry number>]; -e for etymology; -p for pronunciation\n";
|
2021-07-10 11:33:39 +02:00
|
|
|
|
|
|
|
my ($term, $lang, $section, $num, $all, $unique, $opt_e, $opt_p);
|
|
|
|
|
|
|
|
{
|
|
|
|
my $opt_error;
|
|
|
|
local $SIG{__WARN__} = sub {
|
|
|
|
$opt_error = shift;
|
|
|
|
chomp $opt_error;
|
|
|
|
};
|
|
|
|
|
|
|
|
Getopt::Long::Configure("bundling_override");
|
|
|
|
|
|
|
|
GetOptionsFromArray(
|
|
|
|
\@ARGV,
|
|
|
|
'lang|l=s' => \$lang,
|
|
|
|
'section|s=s' => \$section,
|
|
|
|
'num|n=i' => \$num,
|
|
|
|
'all|a' => \$all,
|
|
|
|
'unique|u' => \$unique,
|
|
|
|
'p' => \$opt_p,
|
|
|
|
'e' => \$opt_e,
|
|
|
|
);
|
|
|
|
|
|
|
|
if ($opt_error) {
|
|
|
|
print "$opt_error: $usage\n";
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
$term = "@ARGV";
|
|
|
|
|
|
|
|
if (not length $term) {
|
|
|
|
print $usage;
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$lang //= 'English';
|
|
|
|
$section //= 'definitions';
|
|
|
|
|
2021-10-20 06:01:45 +02:00
|
|
|
$all //= 1;
|
|
|
|
|
2021-07-10 11:33:39 +02:00
|
|
|
if ($opt_p and $opt_e) {
|
|
|
|
print "Options -e and -p cannot be used together.\n";
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($opt_p) {
|
|
|
|
$unique = 1 unless defined $num;
|
|
|
|
$section = 'pronunciations';
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($opt_e) {
|
|
|
|
$unique = 1 unless defined $num;
|
|
|
|
$section = 'etymology';
|
|
|
|
}
|
|
|
|
|
|
|
|
$num //= 1;
|
|
|
|
|
|
|
|
my $cache = Cache::FileCache->new({ namespace => 'wiktionary', default_expires_in => '1 week' });
|
|
|
|
|
|
|
|
my $cache_id = "$term $lang";
|
|
|
|
|
|
|
|
my $entries = $cache->get($cache_id);
|
|
|
|
|
|
|
|
if (not defined $entries) {
|
|
|
|
my $json = `python3 wiktionary.py \Q$term\E \Q$lang\E`;
|
|
|
|
$entries = decode_json $json;
|
|
|
|
$cache->set($cache_id, $entries);
|
|
|
|
}
|
|
|
|
|
|
|
|
my @valid_sections = qw/definitions etymology pronunciations/;
|
|
|
|
|
|
|
|
if (not grep { $_ eq $section } @valid_sections) {
|
|
|
|
print "Unknown section `$section`. Available sections are: " . join(', ', sort @valid_sections) . "\n";
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
|
2021-07-23 16:25:36 +02:00
|
|
|
my $total_entries_count = @$entries;
|
2021-07-10 12:03:56 +02:00
|
|
|
|
|
|
|
my $entries_text = $section;
|
|
|
|
|
2021-07-23 16:25:36 +02:00
|
|
|
if ($num > $total_entries_count) {
|
|
|
|
if ($total_entries_count == 1) {
|
|
|
|
$entries_text =~ s/s$//;
|
|
|
|
} else {
|
|
|
|
$entries_text =~ s/y$/ies/;
|
|
|
|
}
|
2021-07-10 11:33:39 +02:00
|
|
|
|
2021-07-23 16:25:36 +02:00
|
|
|
my $are = $total_entries_count == 1 ? 'is' : 'are';
|
|
|
|
print "No such entry $num. There $are $total_entries_count $entries_text for `$term`.\n";
|
2021-07-10 11:33:39 +02:00
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
my $start = 0;
|
|
|
|
|
|
|
|
if ($num <= 0 or $all or $unique) {
|
2021-07-23 16:25:36 +02:00
|
|
|
$num = $total_entries_count;
|
2021-07-10 11:33:39 +02:00
|
|
|
} else {
|
|
|
|
$start = $num - 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
my @results;
|
|
|
|
|
|
|
|
for (my $i = $start; $i < $num; $i++) {
|
|
|
|
my $entry = $entries->[$i];
|
|
|
|
|
|
|
|
if ($section eq 'etymology') {
|
|
|
|
my $ety = $entry->{etymology};
|
|
|
|
chomp $ety;
|
|
|
|
|
|
|
|
if (not length $ety or $ety =~ /This etymology is missing or incomplete./) {
|
|
|
|
$ety = 'N/A';
|
|
|
|
}
|
|
|
|
|
2021-07-23 16:25:36 +02:00
|
|
|
push @results, $ety unless $ety eq 'N/A' and not $all;
|
2021-07-10 11:33:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($section eq 'pronunciations') {
|
|
|
|
if (exists $entry->{pronunciations}
|
|
|
|
and exists $entry->{pronunciations}->{text})
|
|
|
|
{
|
|
|
|
my $text = join '; ', @{$entry->{pronunciations}->{text}};
|
|
|
|
|
|
|
|
$text = 'N/A' if not length $text;
|
|
|
|
|
2021-07-23 16:25:36 +02:00
|
|
|
push @results, $text unless $text eq 'N/A' and not $all;
|
2021-07-10 11:33:39 +02:00
|
|
|
} else {
|
|
|
|
push @results, 'N/A';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($section eq 'definitions') {
|
|
|
|
my $text;
|
|
|
|
|
|
|
|
foreach my $definition (@{$entry->{definitions}}) {
|
|
|
|
$text .= "$definition->{partOfSpeech}) ";
|
|
|
|
$text .= join("\n\n", @{$definition->{text}}) . "\n\n";
|
|
|
|
|
|
|
|
if (@{$definition->{examples}}) {
|
|
|
|
$text .= "examples:\n\n";
|
|
|
|
$text .= join("\n\n", @{$definition->{examples}}) . "\n\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2021-07-23 16:25:36 +02:00
|
|
|
push @results, $text if length $text;
|
2021-07-10 11:33:39 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-07-23 16:25:36 +02:00
|
|
|
if (not @results) {
|
|
|
|
$entries_text =~ s/y$/ies/;
|
|
|
|
print "There are no $entries_text for $term.\n";
|
|
|
|
exit 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
my $total_results_count = @results;
|
|
|
|
|
|
|
|
if ($total_results_count == 1) {
|
|
|
|
$entries_text =~ s/s$//;
|
|
|
|
} else {
|
|
|
|
$entries_text =~ s/y$/ies/;
|
|
|
|
}
|
|
|
|
|
|
|
|
print "$total_entries_count entries, " if $total_entries_count > 1;
|
|
|
|
|
|
|
|
print "$total_results_count $entries_text for $term:\n\n";
|
|
|
|
|
2021-07-10 11:33:39 +02:00
|
|
|
if ($unique) {
|
|
|
|
my %uniq;
|
|
|
|
|
|
|
|
my $i = 0;
|
|
|
|
foreach my $result (@results) {
|
|
|
|
$i++;
|
|
|
|
next if not $result or $result eq 'N/A';
|
2021-07-23 16:25:36 +02:00
|
|
|
$uniq{$result} .= "$i,";
|
2021-07-10 11:33:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (not keys %uniq) {
|
|
|
|
print "No $section available for $term.\n";
|
|
|
|
exit;
|
|
|
|
}
|
|
|
|
|
2021-07-23 16:25:36 +02:00
|
|
|
no warnings; # sorting "1,2,3" numerically
|
2021-07-10 11:33:39 +02:00
|
|
|
foreach my $key (sort { $uniq{$a} <=> $uniq{$b} } keys %uniq) {
|
2021-07-23 16:25:36 +02:00
|
|
|
my ($q, $p);
|
|
|
|
$uniq{$key} =~ s/,$//;
|
|
|
|
$uniq{$key} =~ s/\b(\d+)(?{$q=$1+1})(?:,(??{$q})\b(?{$p=$q++})){2,}/$1-$p/g;
|
2021-07-10 11:33:39 +02:00
|
|
|
print "$uniq{$key}) $key\n\n";
|
|
|
|
}
|
2021-07-23 16:25:36 +02:00
|
|
|
use warnings;
|
2021-07-10 11:33:39 +02:00
|
|
|
|
|
|
|
exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
my $i = @results == 1 ? $num - 1 : 0;
|
|
|
|
foreach my $result (@results) {
|
|
|
|
$i++;
|
|
|
|
next if not $result;
|
|
|
|
print "$i) $result\n\n";
|
|
|
|
}
|