1200字范文 > 使用POI提取Word文件的内容(纯文本带html格式)

使用POI提取Word文件的内容(纯文本带html格式)

时间：2020-04-17 16:49:01

使用POI提取Word文件的内容，分为带HTML格式和不带格式（纯文本）两种方式。

在pom.xml中引入依赖的jar：

<dependency><groupId>org.apache.poi</groupId><artifactId>poi-scratchpad</artifactId><version>3.17</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>3.17</version></dependency><dependency><groupId>fr.opensagres.xdocreport</groupId><artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId><version>2.0.1</version></dependency>

提取工具类：

import com.datahub.aimindgraph.exception.WordExtractorException;import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;import fr.opensagres.poi.xwpf.converter.core.FileURIResolver;import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.converter.WordToHtmlConverter;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.xwpf.extractor.XWPFWordExtractor;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.w3c.dom.Document;import javax.xml.parsers.DocumentBuilderFactory;import javax.xml.transform.OutputKeys;import javax.xml.transform.Transformer;import javax.xml.transform.TransformerFactory;import javax.xml.transform.dom.DOMSource;import javax.xml.transform.stream.StreamResult;import java.io.*;/*** @Desc word extraction* @Author wadu* @Date /1/19* @Version 1.0**/public class WordUtil {/*** word*/public final static String DOCX = ".docx";/*** word*/public final static String DOC = ".doc";public static void main(String[] args) {File file = new File("D:\\temp\\test.doc");File imageFolderFile = new File("D:\\temp\\images\\");wordToHtmlString(file, imageFolderFile);}public static String wordToHtmlString(String filePath, String imageFolderPath) {return wordToHtmlString(new File(filePath), new File(imageFolderPath));}public static String wordToString(String filePath) {return wordToString(new File(filePath));}/*** 从word获取带html格式的文本* @param file* @param imageFolderFile* @return*/public static String wordToHtmlString(File file, File imageFolderFile) {if (!file.exists()) {throw new WordExtractorException("file does not exists!");} else {if (!imageFolderFile.exists()) {imageFolderFile.mkdirs();}if (file.getName().toLowerCase().endsWith(DOCX)) {return wordToHtmlString(file, imageFolderFile);} else if(file.getName().toLowerCase().endsWith(DOC)){return wordToHtmlString(file, imageFolderFile);} else {throw new 
WordExtractorException("Only doc or docx files are supported");}}}/*** 从word获取不带格式的文本* @param file* @return*/public static String wordToString(File file) {if (!file.exists()) {throw new WordExtractorException("file does not exists!");} else {if (file.getName().toLowerCase().endsWith(DOCX)) {return wordToString(file);} else if(file.getName().toLowerCase().endsWith(DOC)){return wordToString(file);} else {throw new WordExtractorException("Only doc or docx files are supported");}}}/*** @param wordFile* @return*/private static String wordToString(File wordFile) {try(InputStream in = new FileInputStream(wordFile)) {StringBuilder result = new StringBuilder();XWPFDocument document = new XWPFDocument(in);XWPFWordExtractor re = new XWPFWordExtractor(document);result.append(re.getText());re.close();return result.toString();} catch (Exception e) {throw new WordExtractorException(e.getMessage());}}/*** @param wordFile* @return*/private static String wordToString(File wordFile) {try(InputStream in = new FileInputStream(wordFile)) {WordExtractor wordExtractor = new WordExtractor(in);return wordExtractor.getText();} catch (Exception e) {throw new WordExtractorException(e.getMessage());}}/**** @param wordFile* @param imageFolderFile* @return*/private static String wordToHtmlString(File wordFile, File imageFolderFile) {try (InputStream in = new FileInputStream(wordFile);XWPFDocument document = new XWPFDocument(in);ByteArrayOutputStream baos = new ByteArrayOutputStream()) {XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));options.setExtractor(new FileImageExtractor(imageFolderFile));options.setIgnoreStylesIfUnused(false);options.setFragment(true);XHTMLConverter.getInstance().convert(document, baos, options);return baos.toString();} catch (Exception e) {throw new WordExtractorException(e.getMessage());}}/**** @param wordFile* @param imageFolderFile* @return*/private static String wordToHtmlString(File wordFile, File imageFolderFile) {String 
absolutePath = imageFolderFile.getAbsolutePath();String imagePath = absolutePath.endsWith(File.separator) ? absolutePath : absolutePath + File.separator;try (InputStream input = new FileInputStream(wordFile);HWPFDocument wordDocument = new HWPFDocument(input);ByteArrayOutputStream baos = new ByteArrayOutputStream();OutputStream outStream = new BufferedOutputStream(baos)) {WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());//图片存放的位置wordToHtmlConverter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) -> {String imageFile = imagePath + suggestedName;File file = new File(imageFile);try {OutputStream os = new FileOutputStream(file);os.write(content);os.close();} catch (FileNotFoundException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return imageFile;});//解析word文档wordToHtmlConverter.processDocument(wordDocument);Document htmlDocument = wordToHtmlConverter.getDocument();DOMSource domSource = new DOMSource(htmlDocument);StreamResult streamResult = new StreamResult(outStream);TransformerFactory factory = TransformerFactory.newInstance();Transformer serializer = factory.newTransformer();serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");serializer.setOutputProperty(OutputKeys.INDENT, "yes");serializer.setOutputProperty(OutputKeys.METHOD, "html");serializer.transform(domSource, streamResult);return baos.toString();} catch (Exception e) {throw new WordExtractorException(e.getMessage());}}}

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。