过滤Web数据抽取结果文件的Java程序

使用DataScraper抽取的Web数据中可能含有一些XML不允许的字符,而Web数据抽取结果文件是XML格式的。为了保证XML解析器能够正确地解析结果文件,需要在解析之前先过滤掉非法字符。下面的一段Java程序代码将编码为0x01~0x08、0x0B~0x0C、0x0E~0x1F、0x7F~0x84、0x86~0x9F的字符用空格替代,同时将编码段0x01~0xD7FF、0xE000~0xFFFD、0x10000~0x10FFFF之外的字符也用空格替代。该代码段是某个Java类的一个私有方法,仅供参考,其中可能含有bug,而且没有考虑性能优化,使用时请慎重。


   private void filterHarvestFile(File file) {
       InputStreamReader fReader = null;
       OutputStreamWriter fWriter = null;
       String fileName = file.getName();
       try {
           StringBuffer strBuffer = new StringBuffer();
           fReader = new InputStreamReader(new FileInputStream(file), "UTF-8");
           char[] charBuf = new char[2048];
           int len = fReader.read(charBuf, 0, 2048);
           while(len > 0) {
               strBuffer.append(charBuf, 0, len);
               len = fReader.read(charBuf, 0, 2048);
           }
           fReader.close();
           fReader = null;
           for(int i = 0; i < strBuffer.length(); i ++ ) {
               int code = strBuffer.codePointAt(i);
               if((code >= 0x1 && code <= 0x8) ||
                 (code >= 0xB && code <= 0xC) ||
                 (code >= 0xE && code <= 0x1F) ||
                 (code >= 0x7F && code <= 0x84) ||
                 (code >= 0x86 && code <= 0x9F)) {
                   strBuffer.setCharAt(i, ' ');
                   logger.info("Invalid character " + code + " has been filtered out from the file " + fileName);
               }else if((code >= 0x1 && code <= 0xD7FF) ||
                 (code >= 0xE000 && code <= 0xFFFD) ||
                 (code >= 0x10000 && code <= 0x10FFFF)) {
                   continue;
               }else {
                   strBuffer.setCharAt(i, ' ');
                   logger.info("Invalid character " + code + " has been filtered out from the file " + fileName);
               }
           }
           fWriter = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
           fWriter.write(strBuffer.toString());
           fWriter.flush();
           fWriter.close();
           fWriter = null;
       }catch(Exception e) {
           logger.error("...");
       }finally {
           if(fReader != null) {
               try {
                   fReader.close();
               }catch(Exception ee) {}
           }
           if(fWriter != null) {
               try {
                   fWriter.close();
               }catch(Exception ee) {}
           }
       }
   }