-
Notifications
You must be signed in to change notification settings - Fork 281
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Arabic support
- Loading branch information
Showing
6 changed files
with
207 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
# Download the icu4j source code and build it with ant;
# the build generates icu4j.jar and icu4j-charset.jar.
# Then run slim-icu.py to generate slim versions of both jars.
# Currently this script keeps only the Arabic and English data.
|
||
# slim ICU | ||
import sys | ||
import os | ||
from pathlib import Path | ||
import zipfile | ||
from zipfile import ZipFile | ||
|
||
# Folder that holds this script; used when no folder is given on the command line
default_directory = str(Path(__file__).resolve().parent)

# The first command-line argument, when present, names the folder holding the jars
directory = sys.argv[1] if len(sys.argv) > 1 else default_directory

# Compression method passed to ZipFile when the slim jars are created
mode = zipfile.ZIP_DEFLATED
|
||
|
||
def keep_file(filename):
    """Decide whether an entry of icu4j.jar is copied into the slim jar.

    :param filename: path of the entry inside the archive, using "/" as
        separator (e.g. "com/ibm/icu/impl/data/icudt61b/ar.res")
    :return: True to keep the entry, False to drop it
    """
    # skip break iterator data and the other data files listed here
    dropped_suffixes = (".brk", ".dict", "unames.icu", "ucadata.icu", ".spp")
    if filename.endswith(dropped_suffixes):
        return False

    # everything that is not a resource bundle is kept untouched
    if not filename.endswith(".res"):
        return True

    # keep english and arabic bundles only.
    # The prefix must be checked on the base name: entries carry their full
    # archive path, so testing the whole path would never match "en"/"ar".
    # NOTE(review): "root.res" matches neither prefix and is dropped as well;
    # confirm the slim jar does not need it.
    basename = filename.rsplit("/", 1)[-1]
    return basename.startswith(("en", "ar"))
|
||
|
||
# Copy icu4j.jar to icu4j-slim.jar, leaving out every entry rejected by keep_file.
# The with-statement closes both archives even when reading or writing fails.
with ZipFile(os.path.join(directory, 'icu4j.jar'), 'r') as zin, \
        ZipFile(os.path.join(directory, 'icu4j-slim.jar'), 'w', mode) as zout:
    for item in zin.infolist():
        # name and verdict go on one line, same as the charset jar loop
        print(item.filename, end=' ')

        if keep_file(item.filename):
            print("Keep")
            # read only the entries that are actually copied
            zout.writestr(item, zin.read(item.filename))
        else:
            print("Remove")
|
||
|
||
def keep_charset_file(filename):
    """Decide whether an entry of icu4j-charset.jar is copied into the slim jar.

    :param filename: path of the entry inside the archive
    :return: False when the path contains one of the converter file names
        listed below, True otherwise
    """
    unwanted_converters = (
        "cns-11643-1992.cnv",
        "ebcdic-xml-us.cnv",
        "euc-jp-2007.cnv",
        "euc-tw-2014.cnv",
        "gb18030.cnv",
        "ibm-1363_P11B-1998.cnv",
        "ibm-1364_P110-2007.cnv",
        "ibm-1371_P100-1999.cnv",
        "ibm-1373_P100-2002.cnv",
        "ibm-1375_P100-2008.cnv",
        "ibm-1383_P110-1999.cnv",
        "ibm-1386_P100-2001.cnv",
        "ibm-1388_P103-2001.cnv",
        "ibm-1390_P110-2003.cnv",
    )

    # substring match: the converter name may appear anywhere in the entry path
    return not any(converter in filename for converter in unwanted_converters)
|
||
|
||
# Copy icu4j-charset.jar to icu4j-charset-slim.jar, leaving out every entry
# rejected by keep_charset_file.
# The with-statement closes both archives even when reading or writing fails.
with ZipFile(os.path.join(directory, 'icu4j-charset.jar'), 'r') as zin, \
        ZipFile(os.path.join(directory, 'icu4j-charset-slim.jar'), 'w', mode) as zout:
    for item in zin.infolist():
        # name and verdict are printed on the same line
        print(item.filename, end=' ')

        if keep_charset_file(item.filename):
            print("Keep")
            # read only the entries that are actually copied
            zout.writestr(item, zin.read(item.filename))
        else:
            print("Remove")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
package qz.utils; | ||
|
||
import com.ibm.icu.charset.CharsetEncoderICU; | ||
import com.ibm.icu.charset.CharsetProviderICU; | ||
import com.ibm.icu.text.ArabicShaping; | ||
import com.ibm.icu.text.ArabicShapingException; | ||
import com.ibm.icu.text.Bidi; | ||
|
||
import java.nio.ByteBuffer; | ||
import java.nio.CharBuffer; | ||
import java.nio.charset.CharacterCodingException; | ||
import java.nio.charset.Charset; | ||
import java.nio.charset.CoderResult; | ||
import java.nio.charset.StandardCharsets; | ||
|
||
/** | ||
* Created by Yohanes Nugroho on 7/10/2018. | ||
*/ | ||
public class ArabicConversionUtilities { | ||
|
||
/** | ||
* This is the simplest and most reliable method: | ||
* If all characters on input string does not contain any Arabic letters then return it as it is, | ||
* otherwise do special Arabic text conversion | ||
* <p> | ||
* To send data to printer, we need to split the commands from the text, eg:<br/> | ||
* {@code var data = ['\x1b\x41\x42', "Arabic text to print", '\x1b\x42x53', "Other texts"]} | ||
* | ||
* @param escp_or_text a String that contains only ESC/P code or only text | ||
* @return encoded bytes | ||
*/ | ||
public static byte[] convertToIBM864(String escp_or_text) throws CharacterCodingException, ArabicShapingException { | ||
boolean allAscii = true; | ||
for(int i = 0; i < escp_or_text.length(); i++) { | ||
//https://wiki.sei.cmu.edu/confluence/display/java/STR01-J.+Do+not+assume+that+a+Java+char+fully+represents+a+Unicode+code+point | ||
int ch = escp_or_text.codePointAt(i); | ||
if (ch > 255) { | ||
allAscii = false; | ||
} | ||
} | ||
|
||
if (allAscii) { | ||
//we use 'ISO-8859-1' that will map bytes as it is | ||
return escp_or_text.getBytes(StandardCharsets.ISO_8859_1); | ||
} else { | ||
//Layout the characters from logical order to visual ordering | ||
Bidi para = new Bidi(); | ||
para.setPara(escp_or_text, Bidi.LEVEL_DEFAULT_LTR, null); | ||
String data = para.writeReordered(Bidi.DO_MIRRORING); | ||
return convertVisualOrderedToIBM864(data); | ||
} | ||
} | ||
|
||
/** | ||
* Shape a visual ordered Arabic string and then encode it in IBM864 encoding | ||
* | ||
* @param str input string | ||
* @return encoded bytes | ||
*/ | ||
private static byte[] convertVisualOrderedToIBM864(String str) throws ArabicShapingException, CharacterCodingException { | ||
//We shape the characters to map it to Unicode in FExx range | ||
//Note that the output of Bidi is VISUAL_LTR, so we need the flag: ArabicShaping.TEXT_DIRECTION_VISUAL_LTR) | ||
ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_SHAPE | ArabicShaping.TEXT_DIRECTION_VISUAL_LTR | ArabicShaping.LENGTH_GROW_SHRINK); | ||
String shaped = as.shape(str); | ||
|
||
//then we need to convert it to IBM864 using ICU Encoder | ||
CharsetProviderICU icu = new CharsetProviderICU(); | ||
Charset cs = icu.charsetForName("IBM864"); | ||
CharsetEncoderICU icuc = (CharsetEncoderICU)cs.newEncoder(); | ||
|
||
//We need to use fallback for some character forms that can not be found | ||
icuc.setFallbackUsed(true); | ||
ByteBuffer output = ByteBuffer.allocate(shaped.length() * 2); | ||
CharBuffer inp = CharBuffer.wrap(shaped); | ||
CoderResult res = icuc.encode(inp, output, true); | ||
if (res.isError()) { | ||
res.throwException(); | ||
} | ||
|
||
int length = output.position(); | ||
byte all[] = output.array(); | ||
|
||
byte out[] = new byte[length]; | ||
System.arraycopy(all, 0, out, 0, length); | ||
|
||
return out; | ||
} | ||
|
||
} |