// Appendix:Baxter-Sagart Old Chinese reconstruction/BaxterSagartWikitableBuilder.java

// To compile: javac -encoding UTF-8 BaxterSagartWikitableBuilder.java
// To run:     java BaxterSagartWikitableBuilder
// Output:     Baxter-Sagart wikitable.txt

import java.io.*; import java.net.*; import java.util.*; import java.util.zip.*;

/**
 * Downloads the Baxter-Sagart Old Chinese reconstruction table and the Unihan
 * database, and writes a sortable MediaWiki table to {@link #outFile}.
 *
 * <p>The program is a straight pipeline driven by {@link #main()}: download both
 * data sets to temp files, build the traditional-to-simplified character map from
 * Unihan, then stream the Baxter-Sagart rows into the output file.
 *
 * <p>All state is kept in public static fields, so the class is not safe for
 * concurrent use.
 *
 * <p>NOTE(review): several literals emitted by this class (legend entries such as
 * "* TC: character.", and the bare spaces printed around sort keys) look as if
 * wiki markup was stripped from them at some point. They are kept verbatim here;
 * confirm against the intended wikitext before publishing the output.
 */
public class BaxterSagartWikitableBuilder {

    /** Location of the tab-separated Baxter-Sagart data file. */
    public static final String baxterSagartURL = "http://crlao.ehess.fr/docannexe.php?id=1221";

    /** Location of the zipped Unihan database. */
    public static final String unihanURL = "http://www.unicode.org/Public/UNIDATA/Unihan.zip";

    /** Name of the generated wikitable file (written to the working directory). */
    public static final String outFile = "Baxter-Sagart wikitable.txt";

    /** Time (epoch milliseconds) at which the online data was fetched. */
    public static long fetchDate;

    /** Temp file holding the downloaded Baxter-Sagart data. */
    public static File baxterSagartFile;

    /** Temp file holding the downloaded Unihan zip archive. */
    public static File unihanFile;

    /** Maps a traditional Chinese character to its simplified variant. */
    public static TreeMap<String, String> scMap;

    /** Destination of all wikitable output; must be set before any print helper runs. */
    public static PrintWriter writer;

    /**
     * Command-line entry point. Runs {@link #main()} and exits with status 0 on
     * success, or prints the stack trace and exits with status 1 on any failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            main();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
        System.exit(0);
    }

    /**
     * Runs the whole pipeline: download, build the character map, write the table.
     *
     * @throws Exception if downloading, parsing, or writing fails
     */
    public static void main() throws Exception {
        // Remember when the online data was fetched; it is cited in the output.
        fetchDate = System.currentTimeMillis();

        baxterSagartFile = download(baxterSagartURL);
        unihanFile = download(unihanURL);

        // Unihan supplies the traditional -> simplified character mappings.
        processUnihan();

        writer = new PrintWriter(
                new BufferedWriter(
                        new OutputStreamWriter(
                                new BufferedOutputStream(new FileOutputStream(outFile)),
                                "UTF-8")));
        try {
            printPreamble();
            processBaxterSagart();

            // Close the wikitable.
            writer.print("|}\n");

            // PrintWriter never throws on I/O failure; it only records it.
            // checkError() flushes and reports, so a truncated file is not
            // silently treated as success.
            if (writer.checkError()) {
                throw new IOException("Error while writing " + outFile);
            }
        } finally {
            writer.close();
        }
    }

    /** Writes the section heading, source citation, legend and table header row. */
    @SuppressWarnings("deprecation")
    private static void printPreamble() {
        writer.print(
                "== Data ==\n" +
                //":This section is software-generated. The program's Java source code is here.\n" +
                "This table incorporates data from:\n" +
                "* The Unihan Database.\n" +
                "* Baxter, W. and " +
                "L. Sagart (n.d.) " +
                "Baxter-Sagart Old Chinese reconstruction (Version 1.00).  " +
                "Online at http://crlao.ehess.fr/document.php?id=1217 .  Accessed ");

        // The deprecated API is used deliberately: its fixed GMT format is
        // good enough for a citation date.
        writer.print(new Date(fetchDate).toGMTString());

        writer.print(
                ".\n" +
                "Legend of table headers:\n" +
                "* TC: character.\n" +
                "* SC:  character.\n" +
                "* PY: Mandarin  romanization.\n" +
                "* MC:  reconstruction.\n" +
                "* MCI: Middle Chinese initial.\n" +
                "* MCF: Middle Chinese final.\n" +
                "* MCT: Middle Chinese tone.\n" +
                "** A = even tone (平聲).\n" +
                "** B = rising tone (上聲).\n" +
                "** C = departing tone (去聲).\n" +
                "** D = entering tone (入聲).\n" +
                "* OC:  reconstruction.\n" +
                "* Gloss: Word's meaning.\n" +
                "{| class=\"wikitable sortable\"\n" +
                "|-\n" +
                "! TC\n" +
                "! SC\n" +
                "! PY\n" +
                "! MC\n" +
                "! MCI\n" +
                "! MCF\n" +
                "! MCT\n" +
                "! OC\n" +
                "! Gloss\n");
    }

    /**
     * Downloads {@code url} into a temp file that is deleted when the JVM exits.
     *
     * @param url address to fetch
     * @return the temp file holding the downloaded bytes
     * @throws Exception if the connection or the file copy fails
     */
    @SuppressWarnings("deprecation")
    public static File download(String url) throws Exception {
        // The name and location of the temp file are irrelevant, but
        // createTempFile rejects prefixes shorter than three characters, so a
        // fixed prefix is prepended to the (arbitrarily short) hash code.
        File file = File.createTempFile("bsw" + url.hashCode(), null);
        file.deleteOnExit();

        System.out.println("Downloading: " + url);
        System.out.println("This may take a while...");

        InputStream in = new URL(url).openStream();
        try {
            if (!in.markSupported()) {
                in = new BufferedInputStream(in);
            }
            OutputStream out = new BufferedOutputStream(new FileOutputStream(file));
            try {
                byte[] buffer = new byte[0x1000]; // 4K copy buffer
                int read;
                while ((read = in.read(buffer)) >= 0) {
                    out.write(buffer, 0, read);
                }
                out.flush();
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
        return file;
    }

    /**
     * Reads {@code Unihan_Variants.txt} from the downloaded Unihan zip and fills
     * {@link #scMap} with every {@code kSimplifiedVariant} mapping whose two sides
     * differ.
     *
     * @throws RuntimeException if the archive has no entry ending in
     *         {@code Unihan_Variants.txt}
     * @throws Exception if the archive cannot be read or a code point cannot be parsed
     */
    public static void processUnihan() throws Exception {
        TreeMap<String, String> map = new TreeMap<String, String>();

        ZipInputStream zin = new ZipInputStream(
                new BufferedInputStream(new FileInputStream(unihanFile)));
        try {
            // Advance to the variants entry; the stream is then positioned on its data.
            ZipEntry entry;
            while ((entry = zin.getNextEntry()) != null) {
                if (entry.getName().endsWith("Unihan_Variants.txt")) {
                    break;
                }
                zin.closeEntry();
            }
            if (entry == null) {
                throw new RuntimeException("Can't find Unihan_Variants.txt.");
            }

            BufferedReader reader = new BufferedReader(new InputStreamReader(zin, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                // Drop comments and surrounding whitespace, then skip blank lines.
                line = line.replaceFirst("#.*$", "").trim();
                if (line.length() == 0) {
                    continue;
                }

                // Expected layout: code point, field name, value.
                String[] tokens = line.split("\t");
                if (tokens.length < 3) {
                    continue;
                }
                if (!tokens[1].trim().equalsIgnoreCase("kSimplifiedVariant")) {
                    continue;
                }

                String traditional = fromUnicodeNotation(tokens[0]);
                String simplified = fromUnicodeNotation(tokens[2]);
                if (!traditional.equals(simplified)) {
                    map.put(traditional, simplified);
                }
            }
        } finally {
            zin.close();
        }

        scMap = map;
    }

    /**
     * Reads the downloaded Baxter-Sagart file line by line and writes one table
     * row per data line. The first non-empty line (the column header) and any
     * line with fewer than eleven fields are skipped.
     *
     * @throws Exception if the file cannot be read
     */
    public static void processBaxterSagart() throws Exception {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(
                        new BufferedInputStream(new FileInputStream(baxterSagartFile)),
                        "UTF-8"));
        try {
            boolean headerPending = true;
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.length() == 0) {
                    continue;
                }
                if (headerPending) {
                    // The first non-empty line is the table header.
                    headerPending = false;
                    continue;
                }

                // NOTE(review): inside the character class '^' is a literal caret,
                // and both quantified parts can match tabs, so runs of tabs
                // (i.e. empty fields) collapse into a single separator. Kept
                // as-is; confirm whether empty fields occur in the data.
                String[] tokens = line.split("[\\s^\t]*\t\\s*");
                if (tokens.length < 11) {
                    continue;
                }

                processBaxterSagart(
                        tokens[0], tokens[1], tokens[2], tokens[3], tokens[4], tokens[5],
                        tokens[6], tokens[7], tokens[8], tokens[9], tokens[10]);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Writes one wikitable row to {@link #writer}.
     *
     * @param tc    traditional Chinese character; also the lookup key into {@link #scMap}
     * @param py    pinyin with tone diacritics
     * @param py2   ignored; the sortable pinyin is always recomputed from {@code py}
     * @param mc    Middle Chinese reading in ASCII notation
     * @param mci   Middle Chinese initial
     * @param mcf   Middle Chinese final
     * @param mct   Middle Chinese tone; a leading A/B/C/D is expanded to its name
     * @param oc    Old Chinese reconstruction
     * @param gloss meaning of the word
     * @param gst   unused
     * @param utf16 unused
     * @throws Exception declared for interface compatibility
     */
    public static void processBaxterSagart(
            String tc, String py, String py2,
            String mc, String mci, String mcf, String mct,
            String oc, String gloss, String gst, String utf16) throws Exception {

        // Begin the new table row.
        writer.print("|-\n");

        // Traditional Chinese character.
        writer.print('|');
        if (tc.length() > 0) {
            writer.print("lang=zh-Hant|");
            printEscaped(tc);
        }
        writer.print('\n');

        // Simplified Chinese character; falls back to the traditional form.
        writer.print('|');
        if (tc.length() > 0) {
            String simplified = scMap.get(tc);
            if (simplified == null || simplified.length() == 0) {
                simplified = tc;
            }
            writer.print("lang=zh-Hans|");
            printEscaped(simplified);
        }
        writer.print('\n');

        // Pinyin, sortable.
        writer.print('|');
        if (py.length() > 0) {
            printSortableCell(pinyinToSortable(py), py);
        }
        writer.print('\n');

        // Middle Chinese, sortable.
        writer.print('|');
        if (mc.length() > 0) {
            mc = middleChineseToUnicode(mc);
            printSortableCell(middleChineseToSortable(mc), mc);
        }
        writer.print('\n');

        // A medial "r" at the start of the final is misplaced: move it onto the
        // initial, replacing the initial's last character (its trailing dash).
        if (mcf.startsWith("-r")) {
            mcf = "-" + mcf.substring(2);
            // An empty initial has no last character to drop.
            String stem = mci.length() > 0 ? mci.substring(0, mci.length() - 1) : mci;
            mci = stem + "r-";
        }

        // Middle Chinese initial, sortable.
        writer.print('|');
        if (mci.length() > 0) {
            mci = middleChineseToUnicode(mci);
            printSortableCell(middleChineseToSortable(mci), mci);
        }
        writer.print('\n');

        // Middle Chinese final, sortable.
        writer.print('|');
        if (mcf.length() > 0) {
            mcf = middleChineseToUnicode(mcf);
            printSortableCell(middleChineseToSortable(mcf), mcf);
        }
        writer.print('\n');

        // Middle Chinese tone, sortable.
        writer.print('|');
        if (mct.length() > 0) {
            switch (mct.charAt(0)) {
                case 'A': writer.print(" A even"); break;
                case 'B': writer.print(" B rising"); break;
                case 'C': writer.print(" C departing"); break;
                case 'D': writer.print(" D entering"); break;
                default: printEscaped(mct);
            }
        }
        writer.print('\n');

        // Old Chinese, semi-sortable.
        writer.print('|');
        if (oc.length() > 0) {
            writer.print("class=IPA|");
            oc = oldChineseToUnicode(oc);
            printSortableCell(oldChineseToSortable(oc), oc);
        }
        writer.print('\n');

        // Gloss, semi-sortable.
        writer.print('|');
        if (gloss.length() > 0) {
            printEscaped(gloss);
        }
        writer.print('\n');
    }

    /**
     * Prints a cell value, preceded by its sort key (surrounded by single spaces)
     * when the key differs from the value.
     */
    private static void printSortableCell(String sortKey, String value) {
        if (!value.equals(sortKey)) {
            writer.print(" ");
            printEscaped(sortKey);
            writer.print(" ");
        }
        printEscaped(value);
    }

    /**
     * Converts diacritic pinyin to an ASCII sort key: the tone number (1-4) is
     * appended to the syllable, the first toned vowel loses its diacritic, and
     * the first u-umlaut becomes "v".
     *
     * @param string pinyin with tone diacritics
     * @return the sortable form; unchanged if nothing matched
     */
    public static String pinyinToSortable(String string) {
        // "$1" followed by a digit: only one group exists, so the digit is literal.
        string = string.replaceFirst("([āēīōūǖ].*)$", "$1" + "1");
        string = string.replaceFirst("([áéíóúǘ].*)$", "$1" + "2");
        string = string.replaceFirst("([ǎěǐǒǔǚ].*)$", "$1" + "3");
        string = string.replaceFirst("([àèìòùǜ].*)$", "$1" + "4");
        string = string.replaceFirst("[āáǎà]", "a");
        string = string.replaceFirst("[ēéěè]", "e");
        string = string.replaceFirst("[īíǐì]", "i");
        string = string.replaceFirst("[ōóǒò]", "o");
        string = string.replaceFirst("[ūúǔù]", "u");
        string = string.replaceFirst("[üǖǘǚǜ]", "v");
        return string;
    }

    /**
     * Converts the ASCII-friendly Middle Chinese notation to Unicode:
     * apostrophe to glottal stop, "ae" to "æ", "ea" to "ɛ", and "+" to "ɨ".
     *
     * @param string Middle Chinese text in ASCII notation
     * @return the same text with Unicode letters
     */
    public static String middleChineseToUnicode(String string) {
        string = string.replace('\'', 'ʔ');
        string = string.replace("ae", "æ");
        string = string.replace("ea", "ɛ");
        string = string.replace('+', 'ɨ');
        return string;
    }

    /**
     * Builds an ASCII sort key for Unicode Middle Chinese text.
     *
     * @param string output of {@link #middleChineseToUnicode(String)}
     * @return the sort key
     */
    public static String middleChineseToSortable(String string) {
        // Dashes are not needed in sorting.
        string = string.replace("-", "");

        // 'ʔ' < letters
        string = string.replace('ʔ', '\'');

        // 'a' < 'æ' < 'b'
        string = string.replace("æ", "a~");

        // 'd' < 'ɛ' < 'e' (plain "e" must be bumped before "ɛ" takes its place)
        string = string.replace("e", "e~");
        string = string.replace('ɛ', 'e');

        // 'h' < 'ɨ' < 'i' (same two-step swap)
        string = string.replace("i", "i~");
        string = string.replace('ɨ', 'i');

        // Rising tone is second tone.
        string = string.replace('X', '2');

        // Departing tone is third tone.
        string = string.replace('H', '3');

        return string;
    }

    /**
     * Normalizes an Old Chinese reconstruction: devoicing rings become a single
     * ring below (ring above on "ŋ"), the pharyngealization mark is unified, and
     * an onset "g" after the leading asterisk (optionally behind a one-letter
     * prefix and/or an opening bracket) becomes the IPA script "ɡ".
     *
     * @param string Old Chinese reconstruction
     * @return the normalized reconstruction
     */
    public static String oldChineseToUnicode(String string) {
        // Streamlining devoicing diacritics: any run of U+0325 (ring below) or
        // U+030A (ring above) becomes one ring below...
        string = string.replaceAll("[\u0325\u030A]+", "\u0325");
        // ...except on "ŋ", where the ring goes above.
        string = string.replace("ŋ\u0325", "ŋ\u030A");

        // Streamlining pharyngealization diacritics.
        string = string.replace('ˤ', 'ˁ');

        string = string.replaceAll("(\\*(\\[?[A-Za-z]ə?[\\.\\-])?\\[?)g", "$1ɡ");

        return string;
    }

    /**
     * Builds an ASCII-oriented sort key for a normalized Old Chinese reconstruction.
     *
     * @param string output of {@link #oldChineseToUnicode(String)}
     * @return the sort key
     */
    public static String oldChineseToSortable(String string) {
        string = string.replace('ɡ', 'g');

        // Temporarily converting "ts" and "dz" to single letters.
        string = string.replace("ts", "ʦ");
        string = string.replace("dz", "ʣ");

        // Making loosely-bound prefix schwas sort-neutral.
        string = string.replaceAll("ə([\\.\\-])", "$1");

        // Stripping lots of sort-neutral stuff.
        string = string.replaceAll("[\\*\\-\\.\\<\\>\\(\\)\\[\\]\\{\\}]", "");

        // space < 'C' < 'N' < 'ʔ' < letters
        string = string.replace('C', '$');
        string = string.replace('N', '%');
        string = string.replace('ʔ', '\'');

        // 'd' < 'dz' < 'ə' < 'e'
        string = string.replace("ʣ", "d~");
        string = string.replace("e", "e~");
        string = string.replace('ə', 'e');

        // 'g' < 'ɢ' < 'h'
        string = string.replace("ɢ", "g~");

        // plain < devoiced for l, m, n, r; and n < devoiced n < ŋ < devoiced ŋ < 'o'.
        // Devoicing rings become one tilde, and "ŋ" becomes "n~~" so that it
        // sorts after devoiced "n" ("n~") instead of colliding with plain "n".
        string = string.replaceAll("[\u0325\u030A]+", "~");
        string = string.replace("ŋ", "n~~");

        // 't' < 'ts' < 'u'
        string = string.replace("ʦ", "t~");

        // letters < 'ˁ' < 'ʰ' < 'ʷ'
        // "ʷ" gets "z~~" so that it sorts after "ʰ" ("z~") rather than tying with "ˁ".
        string = string.replace("ˁ", "z");
        string = string.replace("ʰ", "z~");
        string = string.replace("ʷ", "z~~");

        return string;
    }

    /**
     * Converts Unihan code point notation (e.g. {@code U+4E00}) to the character
     * it denotes. Everything from the first space or '&lt;' onwards is ignored,
     * so only the first code point of a list is used.
     *
     * @param string code point notation
     * @return the interned one-code-point string
     * @throws NumberFormatException if no hexadecimal digits remain
     * @throws IllegalArgumentException if the number is not a valid code point
     * @throws Exception declared for interface compatibility
     */
    public static String fromUnicodeNotation(String string) throws Exception {
        String hex = string
                .replaceFirst("[ <].*$", "")        // keep only the first value
                .replaceAll("[^0-9A-Fa-f]", "");     // drop "U+" and other noise

        int code = Integer.parseInt(hex, 16);

        try {
            return new StringBuilder(4).appendCodePoint(code).toString().intern();
        } catch (IllegalArgumentException e) {
            // Report the offending input before propagating.
            System.out.println(hex + ", " + code);
            throw e;
        }
    }

    /**
     * Prints {@code string} to {@link #writer} with wikitext-significant
     * characters neutralized: '&amp;' and '"' become named entities, ASCII angle
     * brackets become single angle quotation marks, and apostrophe, square
     * brackets, braces and pipe become numeric entities.
     *
     * @param string text to print
     */
    public static void printEscaped(String string) {
        for (int i = 0, n = string.length(); i < n; i++) {
            char ch = string.charAt(i);
            switch (ch) {
                case '&':
                    writer.print("&amp;");
                    break;
                case '<':
                    writer.print("\u2039"); // single left-pointing angle quotation mark
                    break;
                case '>':
                    writer.print("\u203A"); // single right-pointing angle quotation mark
                    break;
                case '"':
                    writer.print("&quot;");
                    break;
                case '\'':
                case '[':
                case ']':
                case '{':
                case '}':
                case '|':
                    writer.print("&#" + (int) ch + ";");
                    break;
                default:
                    writer.print(ch);
            }
        }
    }

}