Friday, March 11, 2011

Character transcoding

import java.io.*;
import java.net.URL;
import org.xml.sax.InputSource;
import com.sun.xml.parser.Resolver;
import com.sun.xml.util.XmlChars;

public class main
{
//
// how many characters of pushback to accomodate when scanning
// the XML declaration -- enough for the entire XML (or text)
// declaration, except when excessive whitespace is used.
//
static final int PUSHBACK_SIZE = 1024;

//
// Simple demonstration of character set transcoding, where a
// document in one character set is converted into one in
// another character set.
//
// Argument 0: encoding to use for output
// Argument 1: file or URI to transcode
//
// Output is on System.out, in the specified encoding. This
// must be a Java encoding name; unfortunately at this time
// the IANA standard encoding names aren't all accepted, even
// when the encoding is itself supported. (JDK 1.2 is better
// than JDK 1.1 in this respect.)
//
// NOTE: JDK 1.1 and JDK 1.2 do not expose the capability to
// cause an I/O exception when writing characters which can't
// be expressed in the output encoding. This means that the
// transcoding performed by this program may cause SILENT (!!)
// errors; the non-encodable characters will be rendered as
// ASCII question marks ("?") in the output. (This should get
// fixed in a future java release.)
//
// For example, attempting to transcode a document with Chinese,
// Japanese, Korean, or Vietnamese text into US-ASCII will as
// a rule produce lots of "????" text rather than reporting any
// error in the transcoding. However, EUC-JP can be readily
// transcoded to ISO-2022-JP or UTF-8.
//
// All input encodings can be transcoded to UTF-8 or UTF-16.
//
public static void main (String argv [])
{
if (argv.length != 2) {
System.err.println ("Usage: transcode encoding [file|uri]");
System.exit (1);
}

try {
InputSource inSource;
PushbackReader in;
Writer out;

out = new OutputStreamWriter (System.out, argv [0]);
inSource = createInputSource (argv [1]);
in = new PushbackReader (inSource.getCharacterStream (),
PUSHBACK_SIZE);

// Write the XML declaration, optionally followed
// by a new end-of-line if there was none already...
out.write ("");
else
out.write ("?>"
+ System.getProperty ("line.separator"));

// ... then transcode everything (modulo the errors
// that are silently swallowed by java.io)
char buf [] = new char [16 * 1024];
int len;

for (;;) {
len = in.read (buf, 0, buf.length);
if (len < 0) break; out.write (buf, 0, len); } out.close (); } catch (Throwable t) { t.printStackTrace (System.err); } System.exit (0); } static private InputSource createInputSource (String fileOrUrl) throws IOException { File f = new File (fileOrUrl); if (f.exists ()) return Resolver.createInputSource (f); return Resolver.createInputSource (new URL (fileOrUrl), true); } // returns true if saw an XML decl -- avoid adding extra whitespace static private boolean maybeStandaloneDecl ( PushbackReader in, Writer out ) throws IOException { char xmlDecl [] = new char [PUSHBACK_SIZE]; int len = in.read (xmlDecl, 0, xmlDecl.length); for (;;) { // // Must start with ""
if (xmlDecl [cursor] == '?')
break;

// only "standalone=" starts with 's'
boolean isStandalone = (xmlDecl [cursor++] == 's');

// ... all values are singly or doubly quoted
while (!(xmlDecl [cursor] == ''' || xmlDecl [cursor] == '"')
&& cursor < len) cursor++; cursor++; // ... 'standalone="no"' can be dropped; only // the 'yes' needs to be repeated... if (isStandalone && xmlDecl [cursor] == 'y') out.write (" standalone="yes""); // ... skip to the terminating quote while (!(xmlDecl [cursor] == ''' || xmlDecl [cursor] == '"') && cursor < len) cursor++; // continue till last "attribute" // XXX note: extremely long XML declarations won't all // be buffered ... this happens when unhealthy amounts // of whitespace get used. In such cases this routine // will fail by not skipping through the terminal "?>".
}
cursor += 2;
in.unread (xmlDecl, cursor, len - cursor);
return true;
}
in.unread (xmlDecl, 0, len);
return false;
}
}

No comments:

Post a Comment