Bootstrap

Java实现Word转HTML(支持docx及doc文件)

private final static String tempPath = "C:\\Users\\xxx\\Desktop\\Word2Html\\src\\test\\";// base directory where extracted images and related files are saved

    /**
     * Entry point: lets the user pick a Word document and converts it to HTML.
     *
     * <p>The result is written to the {@code Desktop} folder below the user's home
     * directory, named after the input file with its extension replaced by
     * {@code .html}. Any exception raised while selecting or converting is printed
     * to standard error.
     *
     * @param argv command-line arguments (unused)
     */
    public static void main(String[] argv) {
        try {
            JFileChooser fileChooser = new JFileChooser();
            fileChooser.setDialogTitle("Select a Word Document");
            fileChooser.setAcceptAllFileFilterUsed(false);
            fileChooser.addChoosableFileFilter(new javax.swing.filechooser.FileNameExtensionFilter("Word Documents", "doc", "docx"));

            if (fileChooser.showOpenDialog(null) != JFileChooser.APPROVE_OPTION) {
                return; // dialog cancelled or closed: nothing to convert
            }

            File inputFile = fileChooser.getSelectedFile();
            String fileName = inputFile.getAbsolutePath();

            // Build the output path with File so the platform's separator is used.
            File outputDir = new File(System.getProperty("user.home"), "Desktop");
            String baseName = inputFile.getName().replaceFirst("[.][^.]+$", "");
            String outputFileName = new File(outputDir, baseName + ".html").getPath();

            // Compare extensions case-insensitively so "REPORT.DOC" / "Report.Docx" are accepted.
            String lowerName = fileName.toLowerCase(java.util.Locale.ROOT);
            if (lowerName.endsWith(".docx")) {
                docx2Html(fileName, outputFileName);
            } else if (lowerName.endsWith(".doc")) {
                doc2Html(fileName, outputFileName);
            } else {
                // A name typed by hand can bypass the chooser's filter; say so instead of doing nothing.
                System.err.println("Unsupported file type: " + fileName);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a binary {@code .doc} Word file to an HTML file.
     *
     * <p>Embedded pictures are written to an {@code images} folder below
     * {@code tempPath}, and the generated HTML refers to them by the relative path
     * {@code images/<name>}. The HTML is serialized as UTF-8 and handed to
     * {@code writeFile}.
     *
     * @param fileName   path of the {@code .doc} file to read
     * @param outPutFile path of the HTML file to write
     * @throws TransformerException         if serializing the HTML DOM fails
     * @throws IOException                  if the input file cannot be read
     * @throws ParserConfigurationException if no DOM document builder can be created
     */
    public static void doc2Html(String fileName, String outPutFile) throws TransformerException, IOException, ParserConfigurationException {
        long startTime = System.currentTimeMillis();

        // Close the input stream as soon as the document has been loaded;
        // the original left it open for the lifetime of the JVM.
        HWPFDocument wordDocument;
        try (FileInputStream in = new FileInputStream(fileName)) {
            wordDocument = new HWPFDocument(in);
        }
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

        // Where pictures are stored on disk.
        final File imageFolder = new File(tempPath + "images");
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            @Override
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
                // Create the image folder on first use.
                if (!imageFolder.exists()) {
                    if (imageFolder.mkdirs()) {
                        System.out.println("Images folder created at: " + imageFolder.getAbsolutePath());
                    } else {
                        System.out.println("Failed to create images folder.");
                    }
                }

                // Write the picture bytes; a failure is reported but does not abort the conversion.
                File pictureFile = new File(imageFolder, suggestedName);
                try (FileOutputStream fos = new FileOutputStream(pictureFile)) {
                    fos.write(content);
                    System.out.println("Image saved to: " + pictureFile.getAbsolutePath());
                } catch (IOException e) {
                    e.printStackTrace();
                }

                // The returned value ends up in the HTML, so use '/' rather than the
                // platform separator (a backslash on Windows is not a URL path separator).
                return "images/" + suggestedName;
            }
        });

        wordToHtmlConverter.processDocument(wordDocument);

        Document htmlDocument = wordToHtmlConverter.getDocument();
        ByteArrayOutputStream out = new ByteArrayOutputStream();

        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

        // The serializer emitted UTF-8 bytes, so decode them as UTF-8 instead of
        // relying on the platform default charset.
        String htmlContent = new String(out.toByteArray(), "UTF-8");

        // Strip the raw TOC field instruction text that otherwise shows up in the output.
        htmlContent = htmlContent.replaceAll("TOC \\\\o \"1-3\" \\\\h \\\\z \\\\u", "");

        writeFile(htmlContent, outPutFile);
        System.out.println("Generate " + outPutFile + " with " + (System.currentTimeMillis() - startTime) + " ms.");
    }

    /**
     * Writes text to a file using UTF-8, replacing any existing content.
     *
     * <p>This is best-effort: if the file cannot be opened, written or closed, the
     * exception's stack trace is printed and the method returns normally.
     *
     * @param content text to write
     * @param path    path of the file to create or overwrite
     */
    public static void writeFile(String content, String path) {
        // try-with-resources closes the writer (and with it the underlying stream)
        // even when write() fails, and routes close() failures to the catch block
        // instead of dropping them silently.
        try (FileOutputStream fos = new FileOutputStream(new File(path));
             BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "utf-8"))) {
            bw.write(content);
        } catch (IOException ioe) {
            // Also covers FileNotFoundException, which is an IOException.
            ioe.printStackTrace();
        }
    }

    /**
     * Converts a {@code .docx} Word file to an HTML file.
     *
     * <p>A table of contents is built from paragraphs whose style is
     * {@code "Heading 1"}, {@code "Heading 2"} or {@code "Heading 3"} and inserted
     * right after the opening {@code <body>} tag of the converted document (or
     * prepended when no such tag is found). Plain {@code <table>}, {@code <td>} and
     * {@code <th>} tags receive an inline border style. Images are extracted below
     * {@code tempPath}. The final HTML is written with {@code writeFile}.
     *
     * @param fileName   path of the {@code .docx} file to read
     * @param outPutFile path of the HTML file to write
     * @throws TransformerException         declared for signature compatibility
     * @throws IOException                  if the input cannot be read or converted
     * @throws ParserConfigurationException declared for signature compatibility
     */
    public static void docx2Html(String fileName, String outPutFile) throws TransformerException, IOException, ParserConfigurationException {
        long startTime = System.currentTimeMillis();

        // Close the input stream once the document is loaded (it was leaked before).
        XWPFDocument document;
        try (FileInputStream in = new FileInputStream(fileName)) {
            document = new XWPFDocument(in);
        }

        // Build the table of contents from heading paragraphs.
        // NOTE(review): getStyle() may return a style ID rather than the display
        // name "Heading 1"; confirm the values for the documents being converted.
        // NOTE(review): nothing here adds matching id attributes to the converted
        // headings, so the '#<hash>' links may not resolve - confirm.
        StringBuilder toc = new StringBuilder();
        toc.append("<div id='toc'>\n<h2>Table of Contents</h2>\n<ul>\n");
        for (XWPFParagraph paragraph : document.getParagraphs()) {
            String style = paragraph.getStyle();
            if ("Heading 1".equals(style) || "Heading 2".equals(style) || "Heading 3".equals(style)) {
                String text = paragraph.getText();
                toc.append("<li><a href='#").append(text.hashCode()).append("'>")
                        .append(escapeHtml(text)) // heading text comes from the document: escape it
                        .append("</a></li>\n");
            }
        }
        toc.append("</ul>\n</div>\n");

        // Configure the XHTML conversion; images are extracted below tempPath.
        XHTMLOptions options = XHTMLOptions.create().indent(4);
        File imageFolder = new File(tempPath);
        options.setExtractor(new FileImageExtractor(imageFolder));
        options.URIResolver(new FileURIResolver(imageFolder));

        // Convert into memory. The original converted into a FileOutputStream and then
        // cast it to ByteArrayOutputStream, which always failed with ClassCastException.
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        XHTMLConverter.getInstance().convert(document, out, options);
        String htmlContent = new String(out.toByteArray(), "UTF-8");

        // Put the TOC inside <body> so the result stays a well-formed HTML document;
        // fall back to prepending when no <body> tag is present.
        int bodyStart = htmlContent.indexOf("<body");
        int bodyTagEnd = bodyStart < 0 ? -1 : htmlContent.indexOf('>', bodyStart);
        if (bodyTagEnd >= 0) {
            htmlContent = htmlContent.substring(0, bodyTagEnd + 1) + "\n" + toc + htmlContent.substring(bodyTagEnd + 1);
        } else {
            htmlContent = toc + htmlContent;
        }

        // Add visible borders to tables that carry no attributes of their own.
        htmlContent = htmlContent.replace("<table>", "<table style='border: 1px solid black; border-collapse: collapse;'>");
        htmlContent = htmlContent.replace("<td>", "<td style='border: 1px solid black; padding: 5px;'>");
        htmlContent = htmlContent.replace("<th>", "<th style='border: 1px solid black; padding: 5px;'>");

        // Make sure the target directory exists; a bare file name has no parent.
        File parentDir = new File(outPutFile).getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }

        writeFile(htmlContent, outPutFile);
        System.out.println("Generate " + outPutFile + " with " + (System.currentTimeMillis() - startTime) + " ms.");
    }

    /** Escapes the characters that are special in HTML text and attribute values. */
    private static String escapeHtml(String text) {
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }

pom文件

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<!-- Maven build descriptor for the Word2Html project. -->
	<modelVersion>4.0.0</modelVersion>

	<!-- Project coordinates; the build produces a jar. -->
	<groupId>fxma</groupId>
	<artifactId>Word2Html</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>Word2Html</name>
	<url>http://maven.apache.org</url>

	<properties>
		<!-- Source files are read as UTF-8. -->
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	</properties>

	<dependencies>
		<!-- Test-scope only. -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>3.8.1</version>
			<scope>test</scope>
		</dependency>

		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>2.4</version>
		</dependency>

		<!-- Apache POI artifacts, all pinned to 3.8. -->
		<!-- NOTE(review): presumably these supply the HWPF/XWPF document classes used by the converter code; confirm. -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi</artifactId>
			<version>3.8</version>
		</dependency>

		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>3.8</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-scratchpad</artifactId>
			<version>3.8</version>
		</dependency>

		<!-- NOTE(review): confirm this artifact provides the XHTML converter classes used for docx conversion. -->
		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>xdocreport</artifactId>
			<version>1.0.4</version>
		</dependency>
		
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>ooxml-schemas</artifactId>
			<version>1.1</version>
		</dependency>
		
	</dependencies>
</project>

 

 

;