// Directory where extracted images and related files are saved (hard-coded absolute path; note the trailing separator).
private final static String tempPath = "C:\\Users\\xxx\\Desktop\\Word2Html\\src\\test\\";
/**
 * Entry point: lets the user pick a Word document in a file chooser and converts it
 * to an HTML file with the same base name inside the user's {@code Desktop} folder.
 * <p>
 * The extension check is case-insensitive so that a file such as {@code REPORT.DOCX},
 * which the chooser's extension filter accepts, is actually converted instead of being
 * silently skipped.
 *
 * @param argv command-line arguments (unused)
 */
public static void main(String[] argv) {
    try {
        JFileChooser fileChooser = new JFileChooser();
        fileChooser.setDialogTitle("Select a Word Document");
        fileChooser.setAcceptAllFileFilterUsed(false);
        fileChooser.addChoosableFileFilter(new javax.swing.filechooser.FileNameExtensionFilter("Word Documents", "doc", "docx"));
        if (fileChooser.showOpenDialog(null) != JFileChooser.APPROVE_OPTION) {
            return; // dialog was cancelled or closed: nothing to convert
        }
        File inputFile = fileChooser.getSelectedFile();
        String fileName = inputFile.getAbsolutePath();
        // Build the output path with File so the separator is correct on every platform.
        File outputDir = new File(System.getProperty("user.home"), "Desktop");
        String baseName = inputFile.getName().replaceFirst("[.][^.]+$", "");
        String outputFileName = new File(outputDir, baseName + ".html").getPath();
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        String lowerName = fileName.toLowerCase(java.util.Locale.ROOT);
        if (lowerName.endsWith(".docx")) {
            docx2Html(fileName, outputFileName);
        } else if (lowerName.endsWith(".doc")) {
            doc2Html(fileName, outputFileName);
        } else {
            System.err.println("Unsupported file type: " + fileName);
        }
    } catch (Exception e) {
        // Top-level boundary: report the failure instead of letting it kill the JVM silently.
        e.printStackTrace();
    }
}
/**
 * Converts a binary {@code .doc} Word document to an HTML file.
 * <p>
 * Embedded pictures are written to the {@code images} sub-folder of {@code tempPath}
 * and are referenced from the generated HTML through the relative URL
 * {@code images/<name>}. The serialized HTML is UTF-8 and is decoded as UTF-8 before
 * post-processing, so the result does not depend on the platform default charset.
 * <p>
 * NOTE(review): the picture URLs are relative, so they only resolve when the HTML file
 * is placed in {@code tempPath}; confirm whether pictures should be stored next to
 * {@code outPutFile} instead.
 *
 * @param fileName   path of the {@code .doc} file to read
 * @param outPutFile path of the HTML file to write
 * @throws TransformerException         if the HTML DOM cannot be serialized
 * @throws IOException                  if the source document cannot be read
 * @throws ParserConfigurationException if no XML document builder can be created
 */
public static void doc2Html(String fileName, String outPutFile) throws TransformerException, IOException, ParserConfigurationException {
    long startTime = System.currentTimeMillis();

    // Close the input stream as soon as the document has been loaded.
    HWPFDocument wordDocument;
    try (FileInputStream in = new FileInputStream(fileName)) {
        wordDocument = new HWPFDocument(in);
    }

    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
    // Store every embedded picture on disk and tell the converter which URL to reference.
    wordToHtmlConverter.setPicturesManager(new PicturesManager() {
        @Override
        public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
            // Create the picture folder on first use.
            File imageFolder = new File(tempPath + "images");
            if (!imageFolder.exists()) {
                if (imageFolder.mkdirs()) {
                    System.out.println("Images folder created at: " + imageFolder.getAbsolutePath());
                } else {
                    System.out.println("Failed to create images folder.");
                }
            }
            // Best effort: a picture that cannot be written must not abort the whole conversion.
            File pictureFile = new File(imageFolder, suggestedName);
            try (FileOutputStream fos = new FileOutputStream(pictureFile)) {
                fos.write(content);
                System.out.println("Image saved to: " + pictureFile.getAbsolutePath());
            } catch (IOException e) {
                e.printStackTrace();
            }
            // URLs always use '/', whatever the platform's file separator is.
            return "images/" + suggestedName;
        }
    });
    wordToHtmlConverter.processDocument(wordDocument);
    Document htmlDocument = wordToHtmlConverter.getDocument();

    // Serialize the DOM to UTF-8 encoded HTML in memory.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");
    serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

    // Decode with the same charset the serializer used.
    String htmlContent = out.toString("UTF-8");
    // Strip the raw table-of-contents field code that would otherwise show up as text.
    htmlContent = htmlContent.replaceAll("TOC \\\\o \"1-3\" \\\\h \\\\z \\\\u", "");
    writeFile(htmlContent, outPutFile);
    System.out.println("Generate " + outPutFile + " with " + (System.currentTimeMillis() - startTime) + " ms.");
}
/**
 * Writes {@code content} to the file at {@code path} as UTF-8, replacing any existing
 * file.
 * <p>
 * This is a best-effort operation: I/O failures are reported on standard error and not
 * rethrown. Failures raised while closing are reported as well, because closing is when
 * the buffered data is flushed to disk.
 *
 * @param content text to write
 * @param path    path of the destination file
 */
public static void writeFile(String content, String path) {
    // try-with-resources closes the writer first and then the stream, even on failure.
    try (FileOutputStream fos = new FileOutputStream(new File(path));
         BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "utf-8"))) {
        bw.write(content);
    } catch (IOException ioe) {
        // Also covers FileNotFoundException (missing directory, no permission, ...).
        ioe.printStackTrace();
    }
}
/**
 * Converts a {@code .docx} Word document to an HTML file.
 * <p>
 * The document is converted to XHTML in memory, a table of contents built from the
 * heading paragraphs (levels 1 to 3) is inserted at the start of the page body, border
 * styles are added to attribute-less table tags, and the result is written to
 * {@code outPutFile} as UTF-8. Pictures are extracted below {@code tempPath}.
 * <p>
 * NOTE(review): the table-of-contents links point at {@code #<hash of heading text>},
 * but nothing here adds matching {@code id} attributes to the converted headings, so
 * the links may not resolve; confirm against the converter output.
 *
 * @param fileName   path of the {@code .docx} file to read
 * @param outPutFile path of the HTML file to write
 * @throws TransformerException         declared for symmetry with {@code doc2Html}
 * @throws IOException                  if the source document cannot be read or converted
 * @throws ParserConfigurationException declared for symmetry with {@code doc2Html}
 */
public static void docx2Html(String fileName, String outPutFile) throws TransformerException, IOException, ParserConfigurationException {
    long startTime = System.currentTimeMillis();

    // Close the input stream as soon as the document has been loaded.
    XWPFDocument document;
    try (FileInputStream in = new FileInputStream(fileName)) {
        document = new XWPFDocument(in);
    }

    // Build the table of contents from the heading paragraphs.
    StringBuilder toc = new StringBuilder();
    toc.append("<div id='toc'>\n<h2>Table of Contents</h2>\n<ul>\n");
    List<XWPFParagraph> paragraphs = document.getParagraphs();
    for (XWPFParagraph paragraph : paragraphs) {
        String style = paragraph.getStyle();
        if (style == null) {
            continue;
        }
        // Accept both the display name ("Heading 1") and the space-free form ("Heading1").
        String normalizedStyle = style.replace(" ", "");
        if (normalizedStyle.equals("Heading1") || normalizedStyle.equals("Heading2") || normalizedStyle.equals("Heading3")) {
            String text = paragraph.getText();
            // Heading text comes from the document, so escape it before embedding it in markup.
            toc.append("<li><a href='#").append(text.hashCode()).append("'>")
                    .append(escapeHtml(text)).append("</a></li>\n");
        }
    }
    toc.append("</ul>\n</div>\n");

    // Conversion options: indentation plus extraction/resolution of embedded pictures.
    XHTMLOptions options = XHTMLOptions.create().indent(4);
    File imageFolder = new File(tempPath);
    options.setExtractor(new FileImageExtractor(imageFolder));
    options.URIResolver(new FileURIResolver(imageFolder));

    // Make sure the destination directory exists (the parent is null only for a root path).
    File parentDir = new File(outPutFile).getAbsoluteFile().getParentFile();
    if (parentDir != null) {
        parentDir.mkdirs();
    }

    // Convert into memory so the markup can be post-processed before it is written.
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    XHTMLConverter.getInstance().convert(document, buffer, options);
    String htmlContent = buffer.toString("UTF-8");

    // Put the table of contents right after the opening body tag to keep the page
    // well-formed; fall back to prepending when no body tag is present.
    int bodyStart = htmlContent.indexOf("<body");
    int bodyTagEnd = bodyStart < 0 ? -1 : htmlContent.indexOf('>', bodyStart);
    if (bodyTagEnd >= 0) {
        htmlContent = htmlContent.substring(0, bodyTagEnd + 1) + "\n" + toc + htmlContent.substring(bodyTagEnd + 1);
    } else {
        htmlContent = toc + htmlContent;
    }

    // Add visible borders to attribute-less table tags.
    htmlContent = htmlContent.replace("<table>", "<table style='border: 1px solid black; border-collapse: collapse;'>");
    htmlContent = htmlContent.replace("<td>", "<td style='border: 1px solid black; padding: 5px;'>");
    htmlContent = htmlContent.replace("<th>", "<th style='border: 1px solid black; padding: 5px;'>");

    writeFile(htmlContent, outPutFile);
    System.out.println("Generate " + outPutFile + " with " + (System.currentTimeMillis() - startTime) + " ms.");
}

/** Escapes the characters that are significant in HTML text and attribute values. */
private static String escapeHtml(String text) {
    return text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace("\"", "&quot;")
            .replace("'", "&#39;");
}
pom.xml (Maven build file for the code above):
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the Word2Html module, packaged as a jar. -->
<modelVersion>4.0.0</modelVersion>
<groupId>fxma</groupId>
<artifactId>Word2Html</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>Word2Html</name>
<url>http://maven.apache.org</url>
<properties>
<!-- Source files are read as UTF-8. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Test-only dependency (scope "test"); not part of the packaged application. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
<!-- Apache POI artifacts: poi, poi-ooxml and poi-scratchpad are all pinned to the same version (3.8). -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.8</version>
</dependency>
<!-- NOTE(review): presumably supplies the XHTML converter classes used for .docx files; confirm the artifact coordinates. -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>xdocreport</artifactId>
<version>1.0.4</version>
</dependency>
<!-- NOTE(review): ooxml-schemas is versioned separately from the POI artifacts above (1.1 vs 3.8); confirm the pairing is intended. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.1</version>
</dependency>
</dependencies>
</project>