通用的读取方法:
读取 doc
/**
 * Reads the text of a legacy Word (.doc) file.
 *
 * <p>This is a best-effort read: any exception raised while opening, parsing or
 * closing the file is printed via {@code printStackTrace()} and is not propagated.
 *
 * @param file the .doc file to read
 * @return the value of {@code HWPFDocument.getDocumentText()}, or an empty string
 *         if the text could not be read
 */
private static String contextOfDoc(File file) {
    String str = "";
    // try-with-resources releases the document and the stream even when
    // construction or text extraction throws, so no file handle is leaked.
    try (FileInputStream fis = new FileInputStream(file);
         HWPFDocument doc = new HWPFDocument(fis)) {
        str = doc.getDocumentText();
    } catch (Exception e) {
        // Deliberately broad: callers receive "" rather than an exception.
        e.printStackTrace();
    }
    return str;
}
读取 docx
// Handles are declared before the try so the finally block can release
// whichever of them were actually opened.
FileInputStream fis = null;
XWPFDocument xdoc = null;
XWPFWordExtractor extractor = null;
try {
    // Only names ending in ".docx" are parsed; otherwise wordText is left untouched.
    if (suffix.endsWith(".docx")) {
        fis = new FileInputStream(file);
        xdoc = new XWPFDocument(fis);
        extractor = new XWPFWordExtractor(xdoc);
        wordText = extractor.getText();
    }
} catch (IOException e) {
    log.error("getWordContent error", e);
} finally {
    if (extractor != null) {
        try {
            extractor.close();
        } catch (IOException e) {
            log.error("close stream failed", e);
        }
    }
    CloseUtil.closeStream(fis);
}

// Split the whole document text into lines, strip leading/trailing whitespace
// from each line, and drop the lines that end up empty.
String[] lineArr = wordText.split("\r\n|\n\n|\n");
for (String rawLine : lineArr) {
    String trimmed = rawLine.trim();
    if (StringUtils.isNotEmpty(trimmed)) {
        lineList.add(trimmed);
    }
}