#!/usr/bin/perl -wT
# -*- Mode: perl; tab-width: 4; indent-tabs-mode: nil; -*-
#
# utf8-decoder: Converting UTF-8 octal bytes to Unicode codepoints
#
# Copyright (c) 2004 by Ian Hickson
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

use strict;
use CGI;
use CGI::Carp;
use lib '.';
use library;

# Overview: read the 'bytes' and 'encoding' CGI parameters, turn the
# submitted text into a list of byte values according to the selected
# input mode, decode that list as UTF-8 one byte at a time while building
# a human-readable trace, and print the result as an HTML page.  With no
# input, the submission form and its help text are printed instead.
#
# NOTE(review): the HTML markup, the HTTP header and the form method in
# the three pages below were reconstructed around the page text; confirm
# them (and the intended page charset) against the deployed template.

# collect data from user
my $query = CGI->new();

# Test definedness instead of using "|| ''": the string "0" is false in
# Perl, so "|| ''" would throw away a request to decode the single byte 0.
my $bytes = $query->param('bytes');
$bytes = '' unless defined $bytes;
my $encoding = $query->param('encoding');
$encoding = '' unless defined $encoding;

my @bytes;

#warn "asked to decode:\n$bytes\n";

if (length($bytes) > 0 and length($bytes) < 1024) {
    if ($encoding eq 'windows1252') {
        # every submitted character stands for one byte
        @bytes = map { unpack('C', $_) } $bytes =~ m/./gos;
    }
    elsif ($encoding eq 'hex') {
        $bytes =~ s/[^0-9A-Fa-f]//gos; # remove any non-hex characters
        @bytes = map { hex } $bytes =~ m/..?/gos; # two digits per byte
    }
    elsif ($encoding eq 'binary') {
        $bytes =~ s/[^01 \t\r\n]//gos; # keep only bits and separators
        while ($bytes =~ m/\G[^01]*([01]+)[^01]*/gos) {
            push(@bytes, oct "0b$1");
        }
    }
    elsif ($encoding eq 'embedded') {
        # Tokens are backslash escapes or single literal characters.  A hex
        # escape takes at most two digits (one byte), so that text such as
        # "\xC3\xA9cole" does not swallow the hex-looking letters after it.
        while ($bytes =~ m/\G(\\[0-7]+|\\x[0-9A-Fa-f]{1,2}|\\b[01]+|\\d[0-9]+|.)/gos) {
            my $token = $1;
            if ($token =~ m/^\\d([0-9]+)$/os) {
                # decimal escape: keep the digits as a decimal number
                # (they must not go through oct(), which reads octal)
                push(@bytes, $1);
            }
            elsif ($token =~ m/^\\(.+)$/os) {
                # \123 -> 0123, \x12 -> 0x12, \b0101 -> 0b0101; oct()
                # understands all three prefixes
                push(@bytes, oct "0$1");
            }
            else {
                # literal character (including a lone backslash)
                push(@bytes, ord $token);
            }
        }
    }
    else {
        # freeform numeric
        $bytes =~ s/[^\\xb0-9A-Fa-f]/ /gos; # remove any non-numeric characters
        $bytes =~ s/\\/ 0/gos; # replace backslashes with a 'non-decimal' prefix
        $bytes =~ s/0?x/0x/gos; # treat xABC as 0xABC
        foreach my $token (split(' ', $bytes)) {
            # a leading 0 means octal, 0x hex or 0b binary; else decimal
            push(@bytes, $token =~ m/^0/os ? oct($token) : $token);
        }
    }
}

if (@bytes) {
    # Smallest code point that really needs a sequence of the given length
    # (index = number of bytes in the sequence).  A completed sequence
    # whose value is below this could have been encoded in fewer bytes,
    # i.e. it is overlong.
    my @shortest_for_length = (0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000);

    my $result = '';    # decoder trace
    my $entities = '';  # decoded characters as numeric character references
    my $names = '';     # filled in by addName()
    my $remaining = 0;  # continuation bytes still expected
    my $count = 0;      # bytes consumed so far in the current sequence
    my $scratch = 0;    # code point accumulated so far
    my $index = 0;      # 1-based position of the current byte

    foreach my $byte (@bytes) {
        ++$index;
        eval {
            if ($byte !~ m/^[0-9]+$/os or $byte < 0 or $byte > 255) {
                $result .= "\nByte number $index is '$byte'\n";
                die "Not a byte (value out of range).\n";
            }
            $result .= sprintf("\nByte number %d is decimal %d, hex 0x%02X, octal \\%03o, binary %08b\n",
                               $index, $byte, $byte, $byte, $byte);
            if ($byte == 0xFE or $byte == 0xFF) {
                # UTF-8 validity test
                die "Not a valid UTF-8 byte.\n";
            }
            if (($byte & 0b11000000) == 0b10000000) {
                # continuation byte
                # check we're expecting one
                if (not $remaining) {
                    die "Unexpected continuation byte.\n";
                }
                my $r = $remaining - 1;
                $result .= "This is continuation byte $count, expecting $r more.\n";
                # strip off the high bit
                $byte &= ~0b10000000;
            }
            else {
                # check we're not expecting more continuation bytes
                if ($remaining) {
                    $result .= "Previous UTF-8 multibyte sequence incomplete, earlier bytes dropped.\n";
                }
                # count how many leading bits are on
                my $bit = 7;
                $remaining = 0;
                $count = 0;
                $scratch = 0;
                while (($bit >= 0) and ($byte & (1 << $bit)) > 0) {
                    # one more byte expected
                    ++$remaining;
                    # turn off the bit
                    $byte &= ~(1 << $bit);
                    # ready for next bit
                    --$bit;
                }
                # $remaining must be 0, 2, 3, 4, 5, or 6
                $remaining = 1 if $remaining == 0;
                if ($remaining < 1 or $remaining > 6) {
                    die "Not a valid UTF-8 byte (internal error during decoding: remaining == $remaining).\n";
                }
                if ($remaining > 1) {
                    $result .= "This is the first byte of a $remaining byte sequence.\n";
                }
            }
            # add the current byte to the pending number
            --$remaining;
            ++$count;
            $scratch += $byte << (6 * $remaining);
            if (not $remaining) {
                # The sequence is complete, so $count is its length and
                # $scratch its value.  Overlongness can only be judged
                # here: U+1047E has an all-zero payload in its first byte
                # yet is fine, while 0b11100000 0b10000001 0b10000001
                # decodes to U+0041 and is overlong.
                if ($count > 1 and $scratch < $shortest_for_length[$count]) {
                    $result .= "Warning: This is an overlong multibyte sequence.\n";
                }
                $entities .= '&#x' . sprintf('%04x', $scratch) . ';';
                $scratch = sprintf('%04X', $scratch);
                $result .= "\n";
                # addName() comes from the local 'library' module; it is
                # handed the hex code point and references to both outputs
                addName($scratch, \$result, \$names);
                $result .= "\n";
            }
        };
        if ($@) {
            $result .= $@;
            if ($remaining) {
                $result .= "Premature end of multibyte sequence, some bytes dropped\n";
                $remaining = 0;
            }
        }
    }
    if ($remaining) {
        $result .= "End of file during multibyte sequence, some bytes dropped\n";
    }
    if (not length $entities) {
        $entities = '(none)';
        $names = '(none)';
    }
    # encode_entities() also comes from 'library'; $entities itself is
    # printed unescaped on purpose so the browser renders the characters
    $result = encode_entities($result);
    $names = encode_entities($names);
    my $escapedEntities = encode_entities($entities);
    print <<"END_HTML";
Content-Type: text/html

<!DOCTYPE html>
<html>
<head>
<title>utf8-decoder: Results</title>
</head>
<body>
<h2>As character names:</h2>
<pre>$names</pre>
<h2>As raw characters:</h2>
<p>$entities</p>
<h2>As a string of HTML entities:</h2>
<p>$escapedEntities</p>
<h2>Decoder output:</h2>
<pre>$result</pre>
</body>
</html>
END_HTML
}
elsif (length($bytes)) {
    # something was submitted, but no byte could be extracted from it
    # (or the input was 1024 characters or longer)
    print <<"END_HTML";
Content-Type: text/html

<!DOCTYPE html>
<html>
<head>
<title>utf8-decoder: Results</title>
</head>
<body>
<p>You did not include any bytes. Make sure you selected the right submission type.</p>
</body>
</html>
END_HTML
}
else {
    # nothing submitted: show the form.  Any 'encoding' value other than
    # embedded, hex, binary and windows1252 selects the freeform parser.
    print <<"END_HTML";
Content-Type: text/html

<!DOCTYPE html>
<html>
<head>
<title>utf8-decoder</title>
</head>
<body>
<h1>UTF-8 Decoder</h1>
<form action="" method="post">
<p><label>Input type:
<select name="encoding">
<option value="embedded">Embedded</option>
<option value="freeform" selected>Freeform numeric</option>
<option value="hex">Hexadecimal</option>
<option value="binary">Binary</option>
<option value="windows1252">UTF-8 interpreted as Windows-1252</option>
</select></label></p>
<p><label>Enter your bytes:
<textarea name="bytes" rows="10" cols="60"></textarea></label></p>
<p><input type="submit"></p>
</form>
<p>Note: Non-numeric characters in "freeform numeric" and "hexadecimal" modes are silently stripped. In "binary" mode, bytes must be separated from each other by spaces, tabs, or newlines; other characters are stripped too. If you are getting strange results, double check that you selected the right mode.</p>
<p>You can enter bytes in any of the following forms:</p>
<dl>
<dt>Embedded</dt>
<dd>Raw ASCII text with UTF-8 encoded characters represented by backslash escapes:
<ul>
<li>Hexadecimal: \\x12 \\x34 \\x56 \\x78</li>
<li>Decimal: \\d123 \\d45 \\d67</li>
<li>Octal: \\123 \\45</li>
<li>Binary: \\b01010101 \\b10101010 \\b11110000</li>
</ul></dd>
<dt>Freeform numeric</dt>
<dd>Space separated bytes in one or more of the following numeric forms:
<ul>
<li>Hexadecimal: 0x12 0x34 0x56 0x78 or x12 x34 x56 x78</li>
<li>Decimal: 123 45 67</li>
<li>Octal, using either backslashes or leading 0s: \\123 \\45 or 0123 045</li>
<li>Binary: 0b01010101 0b10101010 0b11110000</li>
</ul></dd>
<dt>Hexadecimal</dt>
<dd>A hex dump. Bytes are extracted two characters at a time, with all whitespace ignored, e.g. 123456789ABCDE.</dd>
<dt>Binary</dt>
<dd>Space-separated list of binary numbers, e.g. 01010101 10101010 11110000</dd>
<dt>UTF-8 interpreted as Windows-1252</dt>
<dd>Raw UTF-8 encoded text, but interpreted as Windows-1252. For example, if your source viewer only supports Windows-1252, but the page is encoded as UTF-8, you can select text from your source viewer, paste it here, and see what the characters really are.</dd>
</dl>
</body>
</html>
END_HTML
}