#!/usr/bin/perl -wT
# -*- Mode: perl; tab-width: 4; indent-tabs-mode: nil; -*-
#
# utf8-decoder: Converting UTF-8 bytes (octets) to Unicode code points
#
# Copyright (c) 2004 by Ian Hickson
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

use strict;
use CGI;
use CGI::Carp;
use lib '.';
use library;

# Collect the submission from the user.
#
# Definedness is tested instead of truth: a submission consisting solely
# of "0" is false in Perl, and "|| ''" would discard it as if it were empty.
my $query = CGI->new();
my $bytes = $query->param('bytes');
$bytes = '' unless defined $bytes;
my $encoding = $query->param('encoding');
$encoding = '' unless defined $encoding;

# Byte values extracted from the submission, in input order. Entries are
# not validated here; in freeform mode they may still be non-numeric strings.
my @bytes;

#warn "asked to decode:\n$bytes\n";

# Submissions of 1024 characters or more are not parsed at all.
if (length($bytes) > 0 and length($bytes) < 1024) {
    if ($encoding eq 'windows1252') {
        # every submitted character stands for one byte
        @bytes = map { unpack('C', $_) } $bytes =~ m/./gos;
    } elsif ($encoding eq 'hex') {
        $bytes =~ s/[^0-9A-Fa-f]//gos; # remove any non-hex characters
        # two digits per byte; an odd trailing digit becomes its own byte
        @bytes = map { hex } $bytes =~ m/..?/gos;
    } elsif ($encoding eq 'binary') {
        $bytes =~ s/[^01 \t\r\n]//gos;
        # each run of binary digits is one byte
        while ($bytes =~ m/\G[^01]*([01]+)[^01]*/gos) {
            push(@bytes, oct "0b$1");
        }
    } elsif ($encoding eq 'embedded') {
        # Each token is either a backslash escape or one literal character.
        # A hex escape takes at most two digits: that is all a byte needs,
        # and it lets literal text that starts with a hex letter follow an
        # escape directly.
        while ($bytes =~ m/\G(\\[0-7]+|\\x[0-9A-Fa-f]{1,2}|\\b[01]+|\\d[0-9]+|.)/gos) {
            my $token = $1;
            if ($token =~ m/^\\d([0-9]+)$/os) {
                # decimal escape: must not go through oct(), which would
                # read the digits as octal
                push(@bytes, $1 + 0);
            } elsif ($token =~ m/^\\(.+)$/os) {
                # \NNN, \xHH and \bBBBB become 0NNN, 0xHH and 0bBBBB,
                # all of which oct() understands
                push(@bytes, oct "0$1");
            } else {
                # literal character (including a lone backslash)
                push(@bytes, ord $token);
            }
        }
    } else {
        # freeform numeric (also the fallback for unknown encodings)
        $bytes =~ s/[^\\xb0-9A-Fa-f]/ /gos; # remove any non-numeric characters
        $bytes =~ s/\\/ 0/gos; # replace backslashes with a 'non-decimal' prefix
        $bytes =~ s/0?x/0x/gos; # treat xABC as 0xABC
        foreach my $token (split(' ', $bytes)) {
            # a leading 0 marks octal (0NNN), hex (0x..) or binary (0b..);
            # anything else is passed through unchanged
            push(@bytes, $token =~ m/^0/os ? oct($token) : $token);
        }
    }
}

# Produce the response: decode the collected byte values as UTF-8 and
# report the result, or explain that nothing could be decoded, or (when
# nothing was submitted) show the input form.
if (@bytes) {
    my $result = '';    # human-readable trace of the decoding
    my $entities = '';  # decoded characters as numeric character references
    my $names = '';     # character names, filled in by addName()
    my $remaining = 0;  # bytes still expected in the current sequence
    my $count = 0;      # bytes consumed so far in the current sequence
    my $scratch = 0;    # code point being accumulated
    my $index = 0;      # 1-based position of the current byte
    # Smallest code point that needs a sequence of the given length (index
    # is the length in bytes); a lower value could have been encoded in
    # fewer bytes, which makes the sequence overlong.
    my @minimum = (0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000);
    foreach my $input (@bytes) {
        # work on a copy so that masking bits off does not rewrite @bytes
        my $byte = $input;
        ++$index;
        eval {
            if ($byte !~ m/^[0-9]+$/os or $byte < 0 or $byte > 255) {
                # the token comes from the submission, so it is appended
                # as plain text and never used as a sprintf format
                $result .= "\nByte number $index is '$byte'\n";
                die "Not a byte (value out of range).\n";
            }
            $result .= sprintf("\nByte number %d is decimal %d, hex 0x%02X, octal \\%03o, binary %08b\n", $index, $byte, $byte, $byte, $byte);
            if ($byte == 0xFE or $byte == 0xFF) { # UTF-8 validity test
                die "Not a valid UTF-8 byte.\n";
            }
            if (($byte & 0b11000000) == 0b10000000) {
                # continuation byte
                # check we're expecting one
                if (not $remaining) {
                    die "Unexpected continuation byte.\n";
                }
                my $r = $remaining - 1;
                $result .= "This is continuation byte $count, expecting $r more.\n";
                # strip off the high bit
                $byte &= ~0b10000000;
            } else {
                # check we're not expecting more continuation bytes
                if ($remaining) {
                    $result .= "Previous UTF-8 multibyte sequence incomplete, earlier bytes dropped.\n";
                }
                # count how many leading bits are on
                my $bit = 7;
                $remaining = 0;
                $count = 0;
                $scratch = 0;
                while (($bit >= 0) and ($byte & (1 << $bit)) > 0) {
                    # one more byte expected
                    ++$remaining;
                    # turn off the bit
                    $byte &= ~(1 << $bit);
                    # ready for next bit
                    --$bit;
                }
                # $remaining must be 0, 2, 3, 4, 5, or 6
                $remaining = 1 if $remaining == 0;
                if ($remaining < 1 or $remaining > 6) {
                    die "Not a valid UTF-8 byte (internal error during decoding: remaining == $remaining).\n";
                }
                if ($remaining > 1) {
                    $result .= "This is the first byte of a $remaining byte sequence.\n";
                }
            }
            # add the current byte to the pending number
            --$remaining;
            ++$count;
            $scratch += $byte << (6 * $remaining);
            if (not $remaining) {
                # The sequence is complete and $count is its length. The
                # overlong test needs the whole code point: U+1047E starts
                # with 0b11110000 and is fine, while 0b11100000 0b10000001
                # 0b10000001 is overlong.
                if ($count > 1 and $scratch < $minimum[$count]) {
                    $result .= "Warning: This is an overlong multibyte sequence.\n";
                }
                $entities .= '&#x' . sprintf('%04x', $scratch) . ';';
                my $hex = sprintf('%04X', $scratch);
                $result .= "\n";
                addName($hex, \$result, \$names);
                $result .= "\n";
            }
        };
        if ($@) {
            # report the problem and abandon any sequence in progress
            $result .= $@;
            if ($remaining) {
                $result .= "Premature end of multibyte sequence, some bytes dropped\n";
                $remaining = 0;
            }
        }
    }
    if ($remaining) {
        $result .= "End of file during multibyte sequence, some bytes dropped\n";
    }
    if (not length $entities) {
        $entities = '(none)';
        $names = '(none)';
    }
    # $entities is printed twice below: unescaped, so the character
    # references it holds are rendered as characters, and escaped, so the
    # references themselves are shown.
    $result = encode_entities($result);
    $names = encode_entities($names);
    my $escapedEntities = encode_entities($entities);
    print <<end;
Content-Type: text/html;charset=utf-8

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
<html lang="en">
 <head>
  <title>utf8-decoder: Results</title>
  <style type="text/css">
   pre { margin-bottom: 1em; padding-bottom: 1em; border-bottom: solid thin; }
  </style>
 </head>
 <body>
  <p>As character names:</p>
  <pre>$names</pre>
  <p>As raw characters:</p>
  <pre>$entities</pre>
  <p>As a string of HTML entities:</p>
  <pre>$escapedEntities</pre>
  <p>Decoder output:</p>
  <pre>$result</pre>
 </body>
</html>
end
} elsif (length($bytes)) {
    # Something was submitted but no byte values came out of it: either
    # nothing in the input was recognised in the selected mode, or the
    # input was 1024 characters or longer and was not parsed.
    print <<end;
Content-Type: text/html;charset=utf-8

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
<html lang="en">
 <head>
  <title>utf8-decoder: Results</title>
 </head>
 <body>
  <p>No bytes could be extracted from your input. Make sure you selected
  the right submission type, and that your input is shorter than 1024
  characters.</p>
 </body>
</html>
end
} else {
    # Nothing submitted: show the input form. It is served as windows-1252
    # and the form asks for that charset on submission.
    print <<end;
Content-Type: text/html;charset=windows-1252

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
<html lang="en">
 <head>
  <title>utf8-decoder</title>
  <style type="text/css">
    textarea { display: block; }
    dt { font-weight: bolder; margin: 1em 0 0 0; }
  </style>
 </head>
 <body>
  <form action="utf8-decoder" method="post" enctype="multipart/form-data" accept-charset="windows-1252,iso-8859-1">
   <h1>UTF-8 Decoder</h1>
   <p>
    <label>
     Input type:
     <select name="encoding">
      <option value="embedded" selected="selected">Embedded</option>
      <option value="numeric">Freeform numeric</option>
      <option value="hex">Hexadecimal</option>
      <option value="binary">Binary</option>
      <option value="windows1252">UTF-8 interpreted as Windows-1252</option>
     </select>
    </label>
   </p>
   <p>
    <label>
     Enter your bytes:
     <textarea name="bytes" cols="80" rows="10"></textarea>
    </label>
   </p>
   <p><strong>Note:</strong> Non-numeric characters in "freeform
   numeric" and "hexadecimal" modes are silently stripped. In "binary"
   mode, bytes must be separated from each other by spaces, tabs, or
   newlines; other characters are stripped too. If you are getting
   strange results, double check that you selected the right mode.</p>
   <p>
    <input type="submit" value="Decode">
   </p>
  </form>
  <p>You can enter bytes in any of the following forms:</p>
  <dl>
   <dt>Embedded</dt>
   <dd>Raw ASCII text with UTF-8 encoded characters represented by backslash escapes:
    <ul>
     <li>Hexadecimal: \\x12 \\x34 \\x56 \\x78</li>
     <li>Decimal: \\d123 \\d45 \\d67</li>
     <li>Octal: \\123 \\45</li>
     <li>Binary: \\b01010101 \\b10101010 \\b11110000</li>
    </ul>
   </dd>
   <dt>Freeform numeric</dt>
   <dd>Space separated bytes in one or more of the following numeric forms:
    <ul>
     <li>Hexadecimal: 0x12 0x34 0x56 0x78 or x12 x34 x56 x78</li>
     <li>Decimal: 123 45 67</li>
     <li>Octal, using either backslashes or leading 0s: \\123 \\45 or 0123 045</li>
     <li>Binary: 0b01010101 0b10101010 0b11110000</li>
    </ul>
   </dd>
   <dt>Hexadecimal</dt>
   <dd>A hex dump. Bytes are extracted two characters at a time, with
   all whitespace ignored, e.g. 123456789ABCDE.</dd>
   <dt>Binary</dt>
   <dd>Space-separated list of binary numbers, e.g. 01010101 10101010 11110000</dd>
   <dt>UTF-8 interpreted as Windows-1252</dt>
   <dd>Raw UTF-8 encoded text, but interpreted as Windows-1252. For
   example, if your source viewer only supports Windows-1252, but the
   page is encoded as UTF-8, you can select text from your source
   viewer, paste it here, and see what the characters really are.</dd>
  </dl>
 </body>
</html>
end
}