#!/usr/bin/perl

# Writes a sample /etc/locale.gen file to standard output as a columnated list,
# consisting only of UTF-8 locales supported by the installed version of glibc,
# with comments indicating the languages and territories in plain English.
#
# Requires: column(1)
#
# Copyright 2025 Kerin Millar <kfm@plushkava.net>
# License GPL-2.0-only <https://spdx.org/licenses/GPL-2.0-only.html>

use v5.36;
use File::Spec::Functions qw(catdir catfile);
use Unicode::Normalize qw(NFKD);

use File::Slurper qw(read_dir read_lines read_text);

{
	# The first argument shall be treated as a prefix, if any.
	my $prefix = @ARGV ? $ARGV[0] : '';

	# Read the file containing the supported locale/charmap combinations.
	my @lines = read_lines(catfile($prefix, '/usr/share/i18n', 'SUPPORTED'));

	# Gather the language and territory attributes of the locale templates.
	my $attr_by = map_locale_attributes($prefix);

	# Use column(1) to write out a nicely columnated list.
	my $pipe = open_column("\037");

	my $locale;

	my sub abort ($format) {
		my $path = catfile($prefix, '/usr/share/i18n/locales', $locale);
		die sprintf "mkconfig: $format", $path;
	};

	for my $line (@lines) {
		($locale, my $charmap) = split ' ', $line;

		# The names of the templates do not incorporate a codeset part.
		# Further, as of locale-gen 3.10, the locale.gen(5) file never
		# requires for the locale field to incorporate a codeset.
		$locale =~ s/\.[^@]+//;

		# Select only UTF-8 locales and refrain from incorporating the
		# C.UTF-8 locale because is always compiled by locale-gen(8).
		next if $charmap ne 'UTF-8' || $locale eq 'C';

		# Compose a commented entry which also has a trailing comment,
		# indicating the language and territory in plain English.
		if (! exists $attr_by->{$locale}) {
			abort("Aborting because the '%s' file is missing\n");
		} elsif (! $attr_by->{$locale}->%*) {
			abort("Aborting because no language attribute was found in '%s'\n");
		} else {
			my ($comment, $territory) = $attr_by->{$locale}->@{'language', 'territory'};
			if (length $territory) {
				$comment .= " ($territory)";
			}
			print {$pipe} "# ${locale}\037# ${comment}\n";
		}
	}
	close $pipe or exit 1;
}

sub map_locale_attributes ($prefix) {
	my $top = catdir($prefix, '/usr/share/i18n/locales');
	my @entries = read_dir($top);
	my $regex = qr/
		^
		language  # attribute key
		\h+       # one or more <blank> characters
		"([^"]+)" # non-empty attribute value
		\n        # line break
		territory
		\h+
		"([^"]*)" # attribute value
		$
	/mx;
	my %attr_by;
	for my $locale (@entries) {
		my $data = read_text("$top/$locale");
		my %attr;
		if ($data =~ $regex) {
			@attr{'language', 'territory'} = ($1, ucfirst $2);
			for (values %attr) {
				if (m/[^\p{ASCII}]/) {
					$_ = to_ascii($_);
				}
			}
			for ($attr{'territory'}) {
				if (m/^Myanmar/) {
					$_ = 'Myanmar/Burma';
				} elsif (m/^Russian Federation\z/) {
					$_ = 'Russia';
				} elsif (m/^Turkiye\z/) {
					$_ = 'Turkey';
				}
			}
		}
		$attr_by{$locale} = \%attr;
	}
	return \%attr_by;
}

sub to_ascii ($str) {
	# This behaves similarly to "iconv -f UTF-8 -t US-ASCII//TRANSLIT". At
	# least, to a degree that is sufficient for the inputs being processed.
	$str = NFKD($str);
	$str =~ s/\p{NonspacingMark}//g;
	return $str;
}

sub open_column ($sep) {
	# Ensure that column(1) considers its input stream as US-ASCII.
	local $ENV{'LC_ALL'} = 'C';

	# The encoding is applied as a precaution; over-wide characters written
	# to the pipe become Perl escape sequences e.g. U+00E5 => \x{00e5}.
	if (! open my $pipe, '|-:encoding(US-ASCII)', 'column', '-t', '-s', $sep, '-o', ' ') {
		exit 127;
	} else {
		return $pipe;
	}
}
