#!/usr/bin/perl -wT # -*- Mode: perl; tab-width: 4; indent-tabs-mode: nil; -*- # # character-identifier: simple lookup of Unicode characters # # Copyright (c) 2004 by Ian Hickson # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA use strict; use CGI; use CGI::Carp; use URI::Escape; use lib '.'; use library; # collect data from user my $query = CGI->new(); my $characters = $query->param('characters') || $query->param('bytes') || ''; my @characters = map { ord } $characters =~ m/./gos; my $keywords = $query->param('keywords'); my @keywords = map { quotemeta } split(' ', $keywords || ''); my $showMore = not defined $query->param('hideExtra'); if (@characters) { my $result = ''; my $names = ''; my $entities = ''; my $index = 0; foreach (@characters) { ++$index; $entities .= '&#x' . sprintf('%04x', $_) . ';'; my $code = sprintf('%04X', $_); $result .= sprintf("\nCharacter number $index is decimal %d, hex 0x%02X, octal \\%03o, binary %08b\n\n", $_, $_, $_, $_); addName($code, \$result, \$names); $result .= "\n"; } $result = encode_entities($result); $names = encode_entities($names); my $escapedEntities = encode_entities($entities); $characters = encode_entities($characters); print < Character Identifier: Results

(this script is currently broken)

Characters:

As character names:

$names

As raw characters:

$entities

As a string of HTML entities:

$escapedEntities

In detail:

$result
end } elsif (@keywords) { no utf8; # works around a bug in some versions of perl whereby m/\Q$_\E/ tries to run a local file my @matches; open(FILE, 'NamesList.txt'); # XXX should also search Unihan and Hangul my $data; my $current = undef; my @pendingKeywords = @keywords; while (defined($data = )) { # check if start of new entry if ($data =~ m/^[0-9A-F]{4,5}\t/os) { my $remainingKeywords = 0; foreach (@pendingKeywords) { if (defined $_) { ++$remainingKeywords; last; } } push(@matches, $current) unless $remainingKeywords; @pendingKeywords = @keywords; $current = ''; } elsif ($data !~ m/^\t/os or not defined $current) { # skip non-entry lines next; } $current .= $data; foreach (@pendingKeywords) { if (defined $_ and $data =~ m/$_/si) { $_ = undef; } } } my $remainingKeywords = 0; foreach (@pendingKeywords) { if (defined $_) { ++$remainingKeywords; last; } } push (@matches, $current) unless $remainingKeywords; close(FILE); my $matches; if (@matches) { foreach (@matches) { m/^([0-9A-F]+)\t/os; my $n = hex($1); $_ = encode_entities($_); if ($1 and $showMore) { $_ .= sprintf("\tU+%04X, character &#x%04x;‬, decimal %d, hex 0x%02X, octal \\%03o, binary %08b\n", $n, $n, $n, $n, $n, $n); my @bytes; # XXX this is a very naive implementation if ($n > 0x7F) { while ($n > (1 << 5-scalar(@bytes))) { # need another continuation byte unshift(@bytes, 0b10000000 + ($n & 0b00111111)); $n >>= 6; # shunt out those 6 bits } unshift(@bytes, (0xff & ~((1 << 7 - scalar(@bytes)) - 1)) + $n); } else { push(@bytes, $n); } foreach my $byte (@bytes) { $byte = sprintf('%02x', $byte); } local $" = ' 0x'; my $bytes = uri_escape("0x@bytes"); $_ .= "\tUTF-8: 0x@bytes\n"; } } local $" = "\n"; $matches = "
@matches
"; } else { $matches = '

No matches.

'; } $keywords = encode_entities($keywords); print < Character Finder: Results

Search NamesList.txt for:

$matches end } else { print < Character Identifier

Character Identifier

Enter your characters:

Character Finder

To search the NamesList.txt file, enter your search terms here:

end }