zonemaster.es/zonemaster-engine/t/normalization.t

#!perl
use strict;
use utf8;
use warnings;
use Test::More;

use Test::Differences;
use Test::Exception;
use Zonemaster::Engine::Normalization qw( normalize_name trim_space );

sub char_to_hex_esc {
    my ($char) = @_;
    my $ord = ord($char);
    if ($ord >= 32 && $ord <= 127) {
        return $char;
    } elsif ($ord <= 255) {
        return sprintf('\x%02X', $ord);
    } else {
        return sprintf('\x{%04X}', $ord);
    }
}

sub to_hex_esc {
    my ($str) = @_;
    return join('', map({ char_to_hex_esc($_) } (split //, $str)));
}

subtest 'Valid domains' => sub {
    my %input_domains = (
        # Roots
        '.' => '.',  # Full stop
        '．' => '.', # Fullwidth full stop
        '。' => '.', # Ideographic full stop
        '｡' => '.',  # Halfwidth ideographic full stop

        # Mixed dots with trailing dot
        'example。com.' => 'example.com',
        'example｡com．' => 'example.com',
        'sub.example．com｡' => 'sub.example.com',
        'sub．example.com。' => 'sub.example.com',

        # Mixed dots without trailing dot
        'example。com' => 'example.com',
        'example｡com' => 'example.com',
        'sub.example．com' => 'sub.example.com',
        'sub．example.com' => 'sub.example.com',

        # Domains with U-Labels
        'café.example.com' => 'xn--caf-dma.example.com',
        'エグザンプル｡example｡com' => 'xn--ickqs6k2dyb.example.com',
        'αβγδε.example.com' => 'xn--mxacdef.example.com',

        # Domains with uppercase unicode
        'CafÉ.example.com' => 'xn--caf-dma.example.com',
        'ΑβΓΔε.example.com' => 'xn--mxacdef.example.com',

        # All ascii domains (lowercase)
        'example.com' => 'example.com',
        '0/28.2.0.192.example.com' => '0/28.2.0.192.example.com',
        '_http._tcp.example.com.' => '_http._tcp.example.com',
        'sub-domain.example.com' => 'sub-domain.example.com',

        # All ascii domains with uppercase characters
        'suB-doMaIN.ExamPlE.cOm' => 'sub-domain.example.com',

        # Single label domains
        'test' => 'test',
        'テスト' => 'xn--zckzah',

        # Length limits
        "a" x 63 . ".example.com" => "a" x 63 . ".example.com",
        # this is 253 characters
        ("a" x 15 . ".") x 15 . "b" . ".example.com" => ("a" x 15 . ".") x 15 . "b" . ".example.com",

        # NFC conversion (for each group first is non-NFC, second is equivalent NFC)
        "d\x{006F}\x{0308}d" => 'xn--dd-fka',
        'död' => 'xn--dd-fka',

        "aq\x{0307}\x{0323}a" => 'xn--aqa-9dc3l',
        "aq\x{0323}\x{0307}a" => 'xn--aqa-9dc3l',

        "aḋ\x{0323}a" => 'xn--aa-rub587y',
        "aḍ\x{0307}a" => 'xn--aa-rub587y',
    );

    for my $domain ( sort keys %input_domains ) {
        my $expected_output = $input_domains{$domain};
        my $safe_domain = to_hex_esc($domain);
        subtest "Domain: '$safe_domain'" => sub {
            my ( $errors, $final_domain );
            lives_ok(sub {
                ($errors, $final_domain) = normalize_name($domain);
            }, 'correct domain should live');

            my $actual   = { domain => $final_domain,    errors => $errors };
            my $expected = { domain => $expected_output, errors => [] };

            eq_or_diff $actual, $expected;
        }
    }
};

subtest 'Bad domains' => sub {
    my %input_domains = (
        # Empty labels
        '.。．'           => ['INITIAL_DOT'],
        'example。.com.' => ['REPEATED_DOTS'],
        'example。com.｡' => ['REPEATED_DOTS'],
        '．.example｡com' => ['INITIAL_DOT'],

        # Bad ascii
        'bad:%;!$.example.com.'            => ['INVALID_ASCII'],
        " \x{205F} example.com.  \x{0009}" => ['INVALID_ASCII'],
        '    '                             => ['INVALID_ASCII'],

        # Label to long
        "a" x 64 . ".example.com" => ['LABEL_TOO_LONG'],
        # Length too long after idn conversion (libidn fails)
        'チョコレート' x 8 . 'a' . '.example.com' => ['INVALID_U_LABEL'],
        # Emoji in names are invalid as per IDNA2008
        '❤️．example．com' => ['INVALID_U_LABEL'],

        # Domain to long
        # this is 254 characters
        ( "a" x 15 . "." ) x 15 . "bc" . ".example.com" => ['DOMAIN_NAME_TOO_LONG'],

        # Empty domain
        '' => ['EMPTY_DOMAIN_NAME'],

        # Ambiguous downcasing
        'İ.example.com' => ['AMBIGUOUS_DOWNCASING'],
    );

    for my $domain ( sort keys %input_domains ) {
        my $expected_errors = $input_domains{$domain};
        my $safe_domain = to_hex_esc( $domain );
        subtest "Domain: '$safe_domain'" => sub {
            my ( $errors, $final_domain );
            lives_ok(sub {
                ($errors, $final_domain) = normalize_name($domain);
            }, 'incorrect domain should live');

            my $actual = {
                domain => $final_domain,
                errors => [ map { $_->tag } @$errors ]
            };
            my $expected = {
                domain => undef,
                errors => $expected_errors
            };

            eq_or_diff $actual, $expected;
        }
    }
};

subtest 'Trimming space' => sub {
    my %cases = (
        "example."                     => 'example.',
        "exam   ."                     => 'exam   .',
        " \x{205F} example.  \x{0009}" => 'example.',
    );

    for my $str ( sort keys %cases ) {
        my $expected = $cases{$str};

        my $safe_str = to_hex_esc($str);
        subtest "Domain: '$safe_str'" => sub {
            my $actual = trim_space( $str );

            is $actual, $expected, 'Match expected string';
        }
    }
};

done_testing;