// User:Oduarpa2108039132

// To compile: javac -encoding UTF-8 BaxterSagartWikitableBuilder.java
// To run: java BaxterSagartWikitableBuilder
// Output: Baxter-Sagart wikitable.txt

import java.io.*; import java.net.*; import java.util.*; import java.util.zip.*;

/**
 * Downloads the Baxter-Sagart reconstruction table and the Unihan database,
 * and writes a sortable MediaWiki table to {@link #outFile}.
 *
 * <p>All state is kept in static fields; the class is used as a one-shot
 * command-line program and is not designed for concurrent use.
 */
public class BaxterSagartWikitableBuilder {

    /** URL the Baxter-Sagart data file is downloaded from. */
    public static final String baxterSagartURL =
        "http://crlao.ehess.fr/docannexe.php?id=1221";

    /** URL the Unihan zip archive is downloaded from. */
    public static final String unihanURL =
        "http://www.unicode.org/Public/UNIDATA/Unihan.zip";

    /** Name of the generated wikitable file (written as UTF-8). */
    public static final String outFile = "Baxter-Sagart wikitable.txt";

    /** Time (milliseconds since the epoch) at which the run started fetching data. */
    public static long fetchDate;

    /** Temp file holding the downloaded Baxter-Sagart data. */
    public static File baxterSagartFile;

    /** Temp file holding the downloaded Unihan zip archive. */
    public static File unihanFile;

    /** Maps a traditional character to its simplified variant; filled by {@link #processUnihan()}. */
    public static TreeMap<String, String> scMap;

    /** Destination of all table output; {@link #printEscaped(String)} writes here. */
    public static PrintWriter writer;

    /**
     * Command-line entry point. Exits with status 1 (after printing the stack
     * trace) if anything fails, and with status 0 otherwise.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            main();
        }
        catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
        System.exit(0);
    }

    /**
     * Runs the whole pipeline: download both data sets, build the
     * traditional-to-simplified map, then write the wikitable.
     *
     * @throws Exception if downloading, parsing or writing fails
     */
    public static void main() throws Exception {

        // Remembering the exact time we fetched the online data.
        fetchDate = System.currentTimeMillis();

        // Retrieving Baxter-Sagart data.
        baxterSagartFile = download(baxterSagartURL);

        // Retrieving Unihan data.
        unihanFile = download(unihanURL);

        // Processing Unihan data. We need this for mappings of
        // Traditional Chinese characters to Simplified Chinese characters.
        processUnihan();

        // Beginning to write out wikitable file.
        writer = new PrintWriter(
            new BufferedWriter(
                new OutputStreamWriter(
                    new BufferedOutputStream(
                        new FileOutputStream(outFile)
                    ), "UTF-8"
                )
            )
        );

        try {
            // NOTE(review): several legend strings below contain doubled
            // spaces where a word or link appears to be missing. They are
            // reproduced as found -- confirm the intended wording.
            writer.print(
                "== Data ==\n" +
                //":This section is software-generated. The program's Java source code is here.\n" +
                "This table incorporates data from:\n" +
                "* The Unihan Database.\n" +
                "* Baxter, W. and " +
                "L. Sagart (n.d.) " +
                "Baxter-Sagart Old Chinese reconstruction (Version 1.00).  " +
                "Online at http://crlao.ehess.fr/document.php?id=1217 .  Accessed "
            );

            // Printing the fetch date.
            writer.print(toGmtString(fetchDate));

            writer.print(
                ".\n" +
                "Legend of table headers:\n" +
                "* TC: character.\n" +
                "* SC:  character.\n" +
                "* PY: Mandarin  romanization.\n" +
                "* MC:  reconstruction.\n" +
                "* MCI: Middle Chinese initial.\n" +
                "* MCF: Middle Chinese final.\n" +
                "* MCT: Middle Chinese tone.\n" +
                "** A = even tone (平聲).\n" +
                "** B = rising tone (上聲).\n" +
                "** C = departing tone (去聲).\n" +
                "** D = entering tone (入聲).\n" +
                "* OC:  reconstruction.\n" +
                "* Gloss: Word's meaning.\n" +
                "{| class=\"wikitable sortable\"\n" +
                "|-\n" +
                "! TC\n" +
                "! SC\n" +
                "! PY\n" +
                "! MC\n" +
                "! MCI\n" +
                "! MCF\n" +
                "! MCT\n" +
                "! OC\n" +
                "! Gloss\n"
            );

            // Processing Baxter-Sagart data, and writing to file.
            processBaxterSagart();

            // Close the wikitable.
            writer.print("|}\n");

            // PrintWriter never throws on write errors; checkError() flushes
            // and reports whether any write failed.
            if (writer.checkError())
                throw new IOException("Error while writing " + outFile);
        }
        finally {
            writer.close();
        }

    }

    /**
     * Formats a timestamp the way {@code Date.toGMTString()} does.
     * The deprecated API is used on purpose so the output format stays
     * exactly what the program has always produced.
     */
    @SuppressWarnings("deprecation")
    private static String toGmtString(long millis) {
        return new Date(millis).toGMTString();
    }

    /**
     * Downloads the resource at {@code url} into a new temp file that is
     * scheduled for deletion when the JVM exits.
     *
     * @param url the URL to fetch
     * @return the temp file holding the downloaded bytes
     * @throws Exception if the temp file cannot be created or the transfer fails
     */
    public static File download(String url) throws Exception {

        // Create the temp file. We don't care where it's stored or what its
        // name is, but createTempFile rejects prefixes shorter than three
        // characters, so the hash code alone is not a safe prefix.
        File file = File.createTempFile(
            "bsw" + Integer.toHexString(url.hashCode()), null);

        // The file will be deleted when execution finishes.
        file.deleteOnExit();

        System.out.println("Downloading: " + url);
        System.out.println("This may take a while...");

        // Opening a connection and securing an input stream.
        InputStream in = new URL(url).openStream();
        try {
            // Buffering the input stream, if not already buffered.
            if (!in.markSupported())
                in = new BufferedInputStream(in);

            // Opening a buffered output stream to the temp file.
            OutputStream out = new BufferedOutputStream(
                new FileOutputStream(file));
            try {
                // 4K read/write buffer.
                byte[] bytes = new byte[0x1000];

                // Read/write loop.
                for (;;) {
                    int read = in.read(bytes);
                    if (read < 0) // EOF
                        break;
                    if (read > 0) // Have some data.
                        out.write(bytes, 0, read);
                }

                out.flush();
            }
            finally {
                out.close();
            }
        }
        finally {
            in.close();
        }

        return file;

    }

    /**
     * Reads {@code Unihan_Variants.txt} out of the zip archive in
     * {@link #unihanFile} and fills {@link #scMap} with one entry per
     * {@code kSimplifiedVariant} line whose simplified form differs from the
     * traditional one.
     *
     * @throws RuntimeException if the archive has no entry whose name ends
     *         with {@code Unihan_Variants.txt}
     * @throws Exception if the archive cannot be read or a code point cannot
     *         be parsed
     */
    public static void processUnihan() throws Exception {

        // Create traditional-to-simplified map data structure.
        scMap = new TreeMap<String, String>();

        // Opening zip file.
        ZipInputStream zin = new ZipInputStream(
            new BufferedInputStream(
                new FileInputStream(unihanFile)
            )
        );

        try {
            // Searching for the right zip entry.
            for (;;) {
                ZipEntry entry = zin.getNextEntry();

                if (entry == null)
                    throw new RuntimeException(
                        "Can't find Unihan_Variants.txt.");

                if (entry.getName().endsWith("Unihan_Variants.txt"))
                    break; // We found what we're looking for.

                // This isn't the zip entry we're looking for.
                zin.closeEntry();
            }

            // The zip stream is now positioned on the entry's data.
            BufferedReader reader = new BufferedReader(
                new InputStreamReader(zin, "UTF-8"));

            // Looping through the data, gleaning only what we need.
            String line;
            while ((line = reader.readLine()) != null) {

                // Stripping comments and surrounding whitespace.
                line = line.replaceFirst("#.*$", "").trim();

                // Skipping empty lines.
                if (line.length() == 0)
                    continue;

                // Split line by tab characters; we need at least three fields.
                String[] tokens = line.split("\t");
                if (tokens.length < 3)
                    continue;

                // Determine if this line has data we're looking for.
                if (!tokens[1].trim().equalsIgnoreCase("kSimplifiedVariant"))
                    continue;

                // Traditional and simplified Chinese characters.
                String traditional = fromUnicodeNotation(tokens[0]);
                String simplified = fromUnicodeNotation(tokens[2]);

                // If by chance they are the same, skip them.
                if (traditional.equals(simplified))
                    continue;

                scMap.put(traditional, simplified);
            }
        }
        finally {
            // We're done with the zip file.
            zin.close();
        }

    }

    /**
     * Reads the tab-separated Baxter-Sagart file in {@link #baxterSagartFile}
     * and prints one table row per data line via
     * {@link #processBaxterSagart(String, String, String, String, String, String, String, String, String, String, String)}.
     * The first non-blank line is treated as the header and skipped, as are
     * lines with fewer than eleven fields.
     *
     * @throws Exception if the file cannot be read
     */
    public static void processBaxterSagart() throws Exception {

        // Reading Baxter-Sagart database from temp file.
        BufferedReader reader = new BufferedReader(
            new InputStreamReader(
                new BufferedInputStream(
                    new FileInputStream(baxterSagartFile)
                ), "UTF-8"
            )
        );

        try {
            // We will skip the first non-empty line when we reach it.
            boolean headerPending = true;

            String line;
            while ((line = reader.readLine()) != null) {

                // Skipping empty lines.
                if (line.trim().length() == 0)
                    continue;

                // Skipping the first line, which is a table header.
                if (headerPending) {
                    headerPending = false;
                    continue;
                }

                String[] tokens = splitFields(line);

                // There should be at least eleven tokens.
                if (tokens.length < 11)
                    continue;

                processBaxterSagart(tokens[0], tokens[1], tokens[2],
                    tokens[3], tokens[4], tokens[5], tokens[6],
                    tokens[7], tokens[8], tokens[9], tokens[10]);
            }
        }
        finally {
            reader.close();
        }

    }

    /**
     * Splits one data line on single tab characters and trims every field.
     * Empty fields are kept (limit -1) so that a blank cell cannot shift the
     * columns that follow it.
     */
    private static String[] splitFields(String line) {
        String[] fields = line.split("\t", -1);
        for (int i = 0; i < fields.length; i++)
            fields[i] = fields[i].trim();
        return fields;
    }

    /**
     * Prints one wikitable row to {@link #writer}.
     *
     * <p>NOTE(review): sort keys are emitted as plain text surrounded by
     * single spaces in front of the displayed value. This looks like the
     * remnant of markup that was lost from the source -- confirm the intended
     * wikitext before relying on the output.
     *
     * @param tc    traditional character; also used to look up the simplified form
     * @param py    pinyin
     * @param py2   not read; the sortable pinyin is derived from {@code py}
     * @param mc    Middle Chinese reading in ASCII notation
     * @param mci   Middle Chinese initial
     * @param mcf   Middle Chinese final
     * @param mct   Middle Chinese tone; only its first character is examined
     * @param oc    Old Chinese reconstruction
     * @param gloss gloss text
     * @param gst   unused
     * @param utf16 unused
     * @throws Exception declared for interface compatibility
     */
    public static void processBaxterSagart(
        String tc, String py, String py2,
        String mc, String mci, String mcf, String mct,
        String oc, String gloss, String gst, String utf16
    ) throws Exception {

        // Begin the new table row.
        writer.print("|-\n");

        // Traditional Chinese character.
        writer.print('|');
        if (tc.length() > 0) {
            writer.print("lang=zh-Hant|");
            printEscaped(tc);
        }
        writer.print('\n');

        // Simplified Chinese character; falls back to the traditional form.
        writer.print('|');
        if (tc.length() > 0) {
            String sc = scMap.get(tc);
            if (sc == null || sc.length() == 0)
                sc = tc;
            writer.print("lang=zh-Hans|");
            printEscaped(sc);
        }
        writer.print('\n');

        // Pinyin, sortable.
        writer.print('|');
        if (py.length() > 0)
            printWithSortKey(py, pinyinToSortable(py));
        writer.print('\n');

        // Middle Chinese, sortable.
        printMiddleChineseCell(mc);

        // A medial "r" listed at the start of the final belongs to the initial.
        if (mcf.startsWith("-r")) {
            mcf = "-" + mcf.substring(2);
            // Drop the initial's last character (guarding against an empty
            // initial) and append "r-".
            String stem = mci.length() > 0
                ? mci.substring(0, mci.length() - 1)
                : "";
            mci = stem + "r-";
        }

        // Middle Chinese initial and final, sortable.
        printMiddleChineseCell(mci);
        printMiddleChineseCell(mcf);

        // Middle Chinese tone, sortable.
        writer.print('|');
        if (mct.length() > 0) {
            switch (mct.charAt(0)) {
            case 'A': writer.print(" A even"); break;
            case 'B': writer.print(" B rising"); break;
            case 'C': writer.print(" C departing"); break;
            case 'D': writer.print(" D entering"); break;
            default: printEscaped(mct);
            }
        }
        writer.print('\n');

        // Old Chinese, semi-sortable.
        writer.print('|');
        if (oc.length() > 0) {
            writer.print("class=IPA|");
            String ocUnicode = oldChineseToUnicode(oc);
            printWithSortKey(ocUnicode, oldChineseToSortable(ocUnicode));
        }
        writer.print('\n');

        // Gloss, semi-sortable.
        writer.print('|');
        if (gloss.length() > 0)
            printEscaped(gloss);
        writer.print('\n');

    }

    /** Prints one Middle Chinese cell: '|', optional sort key, value, newline. */
    private static void printMiddleChineseCell(String ascii) {
        writer.print('|');
        if (ascii.length() > 0) {
            String unicode = middleChineseToUnicode(ascii);
            printWithSortKey(unicode, middleChineseToSortable(unicode));
        }
        writer.print('\n');
    }

    /** Prints the sort key (only when it differs from the value), then the value. */
    private static void printWithSortKey(String value, String sortKey) {
        if (!value.equals(sortKey)) {
            writer.print(" ");
            printEscaped(sortKey);
            writer.print(" ");
        }
        printEscaped(value);
    }

    /**
     * Turns tone-marked pinyin into an ASCII sort key: the tone number (1-4)
     * is appended and the first marked vowel of each kind loses its
     * diacritic; any form of "ü" becomes "v".
     *
     * @param string pinyin text
     * @return the sort key; equal to the input when nothing needed changing
     */
    public static String pinyinToSortable(String string) {
        // "$1" + "1" is the replacement "$11": with a single capture group
        // the regex engine reads it as group 1 followed by a literal digit.
        string = string.replaceFirst("([āēīōūǖ].*)$", "$1" + "1");
        string = string.replaceFirst("([áéíóúǘ].*)$", "$1" + "2");
        string = string.replaceFirst("([ǎěǐǒǔǚ].*)$", "$1" + "3");
        string = string.replaceFirst("([àèìòùǜ].*)$", "$1" + "4");
        string = string.replaceFirst("[āáǎà]", "a");
        string = string.replaceFirst("[ēéěè]", "e");
        string = string.replaceFirst("[īíǐì]", "i");
        string = string.replaceFirst("[ōóǒò]", "o");
        string = string.replaceFirst("[ūúǔù]", "u");
        string = string.replaceFirst("[üǖǘǚǜ]", "v");
        return string;
    }

    /**
     * Converts the ASCII-friendly Middle Chinese notation to Unicode:
     * apostrophe to 'ʔ', "ae" to "æ", "ea" to "ɛ" and '+' to 'ɨ'.
     *
     * @param string text in ASCII notation
     * @return the converted text
     */
    public static String middleChineseToUnicode(String string) {
        string = string.replace('\'', 'ʔ');
        string = string.replace("ae", "æ");
        string = string.replace("ea", "ɛ");
        string = string.replace('+', 'ɨ');
        return string;
    }

    /**
     * Builds an ASCII sort key from Middle Chinese text in Unicode notation.
     *
     * @param string text as produced by {@link #middleChineseToUnicode(String)}
     * @return the sort key
     */
    public static String middleChineseToSortable(String string) {

        // Dashes are not needed in sorting.
        string = string.replace("-", "");

        // 'ʔ' < letters
        string = string.replace('ʔ', '\'');

        // 'a' < 'æ' < 'b'
        string = string.replace("æ", "a~");

        // 'd' < 'ɛ' < 'e'  (plain "e" must be bumped before "ɛ" takes its place)
        string = string.replace("e", "e~");
        string = string.replace('ɛ', 'e');

        // 'h' < 'ɨ' < 'i'
        string = string.replace("i", "i~");
        string = string.replace('ɨ', 'i');

        // rising tone is second tone
        string = string.replace('X', '2');

        // departing tone is third tone
        string = string.replace('H', '3');

        return string;

    }

    /**
     * Normalizes an Old Chinese reconstruction: unifies the devoicing and
     * pharyngealization diacritics and replaces an ASCII "g" in initial
     * position (after '*', an optional prefix and an optional '[') with
     * U+0261.
     *
     * @param string reconstruction text
     * @return the normalized text
     */
    public static String oldChineseToUnicode(String string) {

        // Streamlining devoicing diacritics: any run of ring-below (U+0325)
        // or ring-above (U+030A) becomes a single ring-below, except on "ŋ",
        // which takes the ring above.
        string = string.replaceAll("[\u0325\u030A]+", "\u0325");
        string = string.replace("ŋ\u0325", "ŋ\u030A");

        // Streamlining pharyngealization diacritics: U+02E4 -> U+02C1.
        string = string.replace('\u02E4', '\u02C1');

        // ASCII "g" -> U+0261 (IPA script g) in initial position.
        string = string.replaceAll(
            "(\\*(\\[?[A-Za-z]ə?[\\.\\-])?\\[?)g", "$1\u0261");

        return string;

    }

    /**
     * Builds an ASCII-oriented sort key from an Old Chinese reconstruction
     * in the form produced by {@link #oldChineseToUnicode(String)}.
     *
     * @param string normalized reconstruction text
     * @return the sort key
     */
    public static String oldChineseToSortable(String string) {

        // U+0261 (IPA script g) back to ASCII "g".
        string = string.replace('\u0261', 'g');

        // Temporarily converting "ts" and "dz" to single characters.
        string = string.replace("ts", "ʦ");
        string = string.replace("dz", "ʣ");

        // Making loosely-bound prefix schwas sort-neutral.
        string = string.replaceAll("ə([\\.\\-])", "$1");

        // Stripping lots of sort-neutral stuff.
        string = string.replaceAll(
            "[\\*\\-\\.\\<\\>\\(\\)\\[\\]\\{\\}]", "");

        // space < 'C' < 'N' < 'ʔ' < letters
        string = string.replace('C', '$');
        string = string.replace('N', '%');
        string = string.replace('ʔ', '\'');

        // 'd' < 'dz' < 'ə' < 'e'
        string = string.replace("ʣ", "d~");
        string = string.replace("e", "e~");
        string = string.replace('ə', 'e');

        // 'g' < 'ɢ' < 'h'
        string = string.replace("ɢ", "g~");

        // 'l' < 'l̥' < 'm' < 'm̥' < 'n' < 'n̥' < 'ŋ' < 'ŋ̊' < 'o'
        // 'r' < 'r̥' < 's'
        // Rings become "~"; "ŋ" must then sort after "n~", hence "n~~"
        // (and a devoiced "ŋ" ends up as "n~~~").
        string = string.replaceAll("[\u0325\u030A]+", "~");
        string = string.replace("ŋ", "n~~");

        // 't' < 'ts' < 'u'
        string = string.replace("ʦ", "t~");

        // letters < 'ˁ' < 'ʰ' < 'ʷ'
        string = string.replace("\u02C1", "z");   // ˁ
        string = string.replace("ʰ", "z~");
        string = string.replace("ʷ", "z~~");

        return string;

    }

    /**
     * Converts a code point written in Unihan notation (for example
     * {@code U+4E00}) to the corresponding string. Everything from the first
     * space or '<' onward is ignored, so only the first value of a
     * multi-valued field is used.
     *
     * @param string the notation to parse
     * @return the interned one-code-point string
     * @throws NumberFormatException if no hexadecimal digits remain
     * @throws IllegalArgumentException if the number is not a valid code point
     * @throws Exception declared for interface compatibility
     */
    public static String fromUnicodeNotation(String string) throws Exception {

        // Stripping everything after a certain point.
        string = string.replaceFirst("[ <].*$", "");

        // Stripping all non-hexadecimal characters.
        string = string.replaceAll("[^0-9A-Fa-f]", "");

        // Parsing hexadecimal number.
        int code = Integer.parseInt(string, 16);

        // Converting the code point to a string and returning it.
        StringBuilder builder = new StringBuilder(4);
        try {
            builder.appendCodePoint(code);
        }
        catch (IllegalArgumentException e) {
            // Report the offending value before propagating.
            System.out.println(string + ", " + code);
            throw e;
        }
        return builder.toString().intern();

    }

    /**
     * Prints {@code string} to {@link #writer}, replacing characters that
     * have a special meaning in wikitext: {@code &} and {@code "} become
     * named entities, {@code <} and {@code >} become the single angle
     * quotation marks U+2039 and U+203A, and {@code ' [ ] { } |} become
     * numeric character references.
     *
     * @param string the text to print
     */
    public static void printEscaped(String string) {

        int length = string.length();

        for (int index = 0; index < length; index++) {

            char ch = string.charAt(index);

            switch (ch) {
            case '&':
                writer.print("&amp;");
                break;
            case '<':
                writer.print("‹");
                break;
            case '>':
                writer.print("›");
                break;
            case '"':
                writer.print("&quot;");
                break;
            case '\'':
            case '[':
            case ']':
            case '{':
            case '}':
            case '|':
                writer.print("&#" + (int) ch + ";");
                break;
            default:
                writer.print(ch);
            }

        }

    }

}