# File: ValidateString.pm
#
# Purpose: ensures that a given string conforms to PBot's limitations
# for internal strings. This means ensuring the string is not too long,
# does not have undesired characters, etc.

# SPDX-FileCopyrightText: 2021 Pragmatic Software <pragma78@gmail.com>
# SPDX-License-Identifier: MIT
package PBot::Utils::ValidateString;

use PBot::Imports;

# export validate_string subroutine
require Exporter;
our @ISA = qw/Exporter/;
our @EXPORT = qw/validate_string/;

use JSON;
use Encode;
use Unicode::Truncate;

# validate_string converts a given string to one that conforms to
# PBot's limitations for internal strings. At present this means
# enforcing a maximum length (see validate_this_string).
#
# If the given string is a JSON object, it is parsed and each of its
# values is validated individually; the object is then encoded back to
# a JSON string. JSON structures must have a depth of one level only.
#
# Any other string -- including JSON that is not an object (arrays,
# bare scalars, "null") -- is validated as one plain string.
#
# Note that $max_length represents bytes, not characters. Truncation
# uses Unicode::Truncate to find the longest Unicode string that can
# fit within $max_length bytes without corruption of the characters.
#
# Parameters:
#   $string     - the string to validate.
#   $max_length - maximum length in bytes. If undefined, it defaults
#                 to 8k. If 0, no truncation occurs.
#
# Returns the validated string. An undefined or empty $string is
# returned as-is. Exceptions thrown while validating a plain string
# propagate to the caller.
sub validate_string {
    my ($string, $max_length) = @_;

    # nothing to validate; return as-is.
    return $string if not defined $string or not length $string;

    # set default max length if none given
    $max_length //= 1024 * 8;

    # Holds the re-encoded JSON text. It stays undefined unless $string
    # decodes to a JSON object AND every one of its values validates.
    my $validated_json;

    {
        # keep the caller's $@ intact; only this eval is shielded, so an
        # exception from the plain-string fallback below is not affected.
        local $@;

        eval {
            # attempt to decode as a JSON string; throws if that fails.
            my $data = decode_json($string);

            # Only JSON objects have values to validate. Anything else
            # (undef from "null", arrays, scalars, booleans) is left for
            # the plain-string path so it is still length-checked.
            #
            # The result is carried out through $validated_json because
            # a `return` in here would only leave the eval, not the sub.
            if (ref($data) eq 'HASH') {
                foreach my $key (keys %$data) {
                    $data->{$key} = validate_this_string($data->{$key}, $max_length);
                }

                # encode back to a JSON string
                $validated_json = encode_json($data);
            }
        };
    }

    # all values validated!
    return $validated_json if defined $validated_json;

    # not a JSON object (or validating its values failed), so validate
    # the whole thing as a normal string.
    return validate_this_string($string, $max_length);
}
# validate_this_string validates a single plain string.
#
# Parameters:
#   $string     - the value to validate. An undefined value (such as a
#                 JSON null member handed over by validate_string) is
#                 returned untouched.
#   $max_length - maximum length in bytes; truncation only happens when
#                 this is greater than 0.
#
# Returns the (possibly truncated) string.
#
# NOTE(review): the file header says the string is "decoded back" after
# truncation, but no decode step happens here; when truncation is
# enabled the caller receives whatever truncate_egc returns for the
# UTF-8 encoded input -- confirm this is what callers expect.
sub validate_this_string {
    my ($string, $max_length) = @_;

    # nothing to encode or truncate.
    return $string if not defined $string;

    # truncate safely
    if ($max_length > 0) {
        # $max_length counts bytes, so measure the UTF-8 encoded form and
        # let Unicode::Truncate pick a cut point that keeps it intact.
        $string = encode('UTF-8', $string);
        $string = truncate_egc($string, $max_length);
    }

    # allow only these characters.
    # TODO: probably going to delete this code.
    # replace any extraneous characters with escaped-hexadecimal representation
    # $string =~ s/(\P{PosixGraph})/
    # my $ch = $1;
    # if ($ch =~ m{[\s\x03\x02\x1d\x1f\x16\x0f]}) {
    # $ch;
    # } else {
    # sprintf "\\x%02X", ord $ch;
    # }/gxe;

    return $string;
}
1;