使用DataScraper抽取的Web数据中可能含有一些XML不允许的字符,而Web数据抽取结果文件是XML格式的。为了保证XML解析器能够正确地解析结果文件,需要在解析之前先过滤掉这些非法字符。下面的一段Java程序代码将编码为0x01~0x08、0x0B~0x0C、0x0E~0x1F、0x7F~0x84、0x86~0x9F的字符用空格替代,同时将编码范围0x01~0xD7FF、0xE000~0xFFFD、0x10000~0x10FFFF之外的字符也用空格替代。该代码段是某个Java类的一个私有方法,仅供参考,其中可能含有bug,而且没有考虑性能优化,使用时请慎重。
/**
 * Rewrites the given UTF-8 file in place, replacing every character that this
 * filter treats as invalid for XML with a single space.
 *
 * <p>A code point is replaced when it lies in one of the control ranges
 * 0x01-0x08, 0x0B-0x0C, 0x0E-0x1F, 0x7F-0x84, 0x86-0x9F, or when it lies
 * outside all of 0x01-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF (this covers
 * NUL, unpaired surrogates, 0xFFFE and 0xFFFF). Each replacement is logged at
 * info level together with the file name.
 *
 * <p>The whole file is read into memory before it is written back. Any
 * exception raised while reading, filtering or writing is logged and not
 * propagated to the caller.
 *
 * @param file the UTF-8 encoded file to filter; it is overwritten with the
 *             filtered content
 */
private void filterHarvestFile(File file) {
    InputStreamReader fReader = null;
    OutputStreamWriter fWriter = null;
    String fileName = file.getName();
    try {
        // Load the complete file content as UTF-16 text.
        StringBuilder text = new StringBuilder();
        fReader = new InputStreamReader(new FileInputStream(file), "UTF-8");
        char[] charBuf = new char[2048];
        int len = fReader.read(charBuf, 0, charBuf.length);
        while (len > 0) {
            text.append(charBuf, 0, len);
            len = fReader.read(charBuf, 0, charBuf.length);
        }
        fReader.close();
        fReader = null;

        int i = 0;
        while (i < text.length()) {
            int code = text.codePointAt(i);

            boolean filteredControl =
                    (code >= 0x1 && code <= 0x8)
                    || (code >= 0xB && code <= 0xC)
                    || (code >= 0xE && code <= 0x1F)
                    || (code >= 0x7F && code <= 0x84)
                    || (code >= 0x86 && code <= 0x9F);
            boolean inAllowedRange =
                    (code >= 0x1 && code <= 0xD7FF)
                    || (code >= 0xE000 && code <= 0xFFFD)
                    || (code >= 0x10000 && code <= 0x10FFFF);

            if (filteredControl || !inAllowedRange) {
                // Every rejected code point is below 0x10000, so it occupies
                // exactly one char and a single setCharAt replaces it fully.
                text.setCharAt(i, ' ');
                logger.info("Invalid character " + code + " has been filtered out from the file " + fileName);
            }

            // Step over the whole code point. Advancing by one char would
            // revisit the low surrogate of a valid supplementary character,
            // misread it as an unpaired surrogate and overwrite it.
            i += Character.charCount(code);
        }

        fWriter = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
        fWriter.write(text.toString());
        fWriter.flush();
        fWriter.close();
        fWriter = null;
    } catch (Exception e) {
        // Keep the method best-effort, but record which file failed and why.
        logger.error("Failed to filter invalid XML characters from the file " + fileName, e);
    } finally {
        // The streams are only still open here if an exception interrupted
        // the normal flow; that failure has already been logged above.
        if (fReader != null) {
            try {
                fReader.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing the reader fails.
            }
        }
        if (fWriter != null) {
            try {
                fWriter.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing the writer fails.
            }
        }
    }
}