java使用POI实现html和word相互转换

湛蓝之海 发表于 2021-10-6 14:42:39

这篇文章主要为大家详细介绍了java使用POI实现html和word的相互转换，具有一定的参考价值，感兴趣的小伙伴们可以参考一下
项目后端使用了springboot，maven，前端使用了ckeditor富文本编辑器。目前从html转换的word为doc格式，而图片处理支持的是docx格式，所以需要手动把doc另存为docx，然后才可以进行图片替换。
一.添加maven依赖
主要使用了以下和poi相关的依赖，为了便于获取html的图片元素，还使用了jsoup：

<!-- Apache POI core -->
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi</artifactId>
  <version>3.14</version>
</dependency>

<!-- HWPF: Word 97-2003 (.doc) support -->
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-scratchpad</artifactId>
  <version>3.14</version>
</dependency>

<!-- XWPF: Word 2007+ (.docx) support -->
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-ooxml</artifactId>
  <version>3.14</version>
</dependency>

<!-- docx to XHTML converter -->
<dependency>
  <groupId>fr.opensagres.xdocreport</groupId>
  <artifactId>xdocreport</artifactId>
  <version>1.0.6</version>
</dependency>

<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-ooxml-schemas</artifactId>
  <version>3.14</version>
</dependency>

<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>ooxml-schemas</artifactId>
  <version>1.3</version>
</dependency>

<!-- jsoup: used to locate the <img> elements in the HTML -->
<dependency>
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.11.3</version>
</dependency>
二.word转换为html
在springboot项目的resources目录下新建static文件夹，将需要转换的word文件（下面的代码读取的是test.doc和test.docx）粘贴进去，由于static是springboot的默认资源文件夹，所以不需要在配置文件里面另行配置了，如果改成其他名字，需要在application.yml进行相应配置。
doc格式转换为html：

/**
 * Converts the Word 97-2003 file {@code static\test.doc} under the classpath root into
 * {@code static\test2.html}. Embedded pictures are written to {@code static\image\} and
 * referenced from the HTML as {@code image/<name>}.
 *
 * <p>Paths are built with Windows separators, exactly as in the published listing.
 *
 * @return the absolute path of the generated HTML file
 * @throws Exception if the source document cannot be read, converted or serialized
 */
public static String doctohtml() throws Exception {
    File path = new File(ResourceUtils.getURL("classpath:").getPath());
    String imagepathstr = path.getAbsolutePath() + "\\static\\image\\";
    String sourcefilename = path.getAbsolutePath() + "\\static\\test.doc";
    String targetfilename = path.getAbsolutePath() + "\\static\\test2.html";

    File imagedir = new File(imagepathstr);
    if (!imagedir.exists()) {
        imagedir.mkdirs();
    }

    org.w3c.dom.Document document =
            DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
    WordToHtmlConverter wordtohtmlconverter = new WordToHtmlConverter(document);
    // Save every embedded picture and return the relative path used by the <img> tag.
    wordtohtmlconverter.setPicturesManager((content, picturetype, name, width, height) -> {
        try (FileOutputStream out = new FileOutputStream(imagepathstr + name)) {
            out.write(content);
        } catch (IOException e) {
            // Best effort: a picture that cannot be saved must not abort the conversion.
            e.printStackTrace();
        }
        return "image/" + name;
    });

    // try-with-resources: the original listing never closed the source stream.
    try (InputStream in = new FileInputStream(sourcefilename)) {
        wordtohtmlconverter.processDocument(new HWPFDocument(in));
    }

    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");
    serializer.transform(
            new DOMSource(wordtohtmlconverter.getDocument()),
            new StreamResult(new File(targetfilename)));
    return targetfilename;
}
docx格式转换为html

/**
 * Converts the Word 2007+ file {@code static\test.docx} under the classpath root into
 * {@code static\test.html}. Embedded pictures are extracted below {@code static\image} and
 * referenced from the HTML through the {@code image} URI prefix.
 *
 * <p>Paths are built with Windows separators, exactly as in the published listing.
 *
 * @return the absolute path of the generated HTML file
 * @throws Exception if the source document cannot be read or converted
 */
public static String docxtohtml() throws Exception {
    File path = new File(ResourceUtils.getURL("classpath:").getPath());
    String imagepath = path.getAbsolutePath() + "\\static\\image";
    String sourcefilename = path.getAbsolutePath() + "\\static\\test.docx";
    String targetfilename = path.getAbsolutePath() + "\\static\\test.html";

    // try-with-resources: the original listing never closed the source stream.
    try (InputStream in = new FileInputStream(sourcefilename)) {
        XWPFDocument document = new XWPFDocument(in);

        XHTMLOptions options = XHTMLOptions.create();
        // Folder that receives the extracted pictures.
        options.setExtractor(new FileImageExtractor(new File(imagepath)));
        // Path prefix written into the src attribute of the generated <img> tags.
        options.URIResolver(new BasicURIResolver("image"));

        // The target file is opened only after the document has been parsed, as before.
        try (Writer writer = new OutputStreamWriter(
                new FileOutputStream(targetfilename), StandardCharsets.UTF_8)) {
            XHTMLConverter xhtmlconverter = (XHTMLConverter) XHTMLConverter.getInstance();
            xhtmlconverter.convert(document, writer, options);
        }
    }
    return targetfilename;
}
转换成功后会生成对应的html文件，如果想在前端展示，直接读取文件转换为string返回给前端即可。

public static string readfile(string filepath) {
file file = new file(filepath);
inputstream input = null;
try {
input = new fileinputstream(file);
} catch (filenotfoundexception e) {
e.printstacktrace();
}
stringbuffer buffer = new stringbuffer();
byte[] bytes = new byte;
try {
for (int n; (n = input.read(bytes)) != -1;) {
   buffer.append(new string(bytes, 0, n, "utf8"));
}
} catch (ioexception e) {
e.printstacktrace();
}
return buffer.tostring();
}
在富文本编辑器ckeditor中的显示效果：

三.html转换为word
实现思路就是先把html中的所有图片元素提取出来，依次替换为带序号的变量字符“${imgreplace1}”、“${imgreplace2}”……（序号从1开始，每张图片对应一个变量），之后生成对应的doc文件（之前试过直接生成docx文件发现打不开，这个问题尚未找到好的解决方法），我们将其手动另存为docx文件，之后就可以把变量替换为图片了：

public static string writewordfile(string content) {
string path = "d:/wordfile";
map<string, object> param = new hashmap<string, object>();

if (!"".equals(path)) {
   file filedir = new file(path);
   if (!filedir.exists()) {
   filedir.mkdirs();
   }
   content = htmlutils.htmlunescape(content);
   list<hashmap<string, string>> imgs = getimgstr(content);
   int count = 0;
   for (hashmap<string, string> img : imgs) {
   count++;
   //处理替换以“/>”结尾的img标签
   content = content.replace(img.get("img"), "${imgreplace" + count + "}");
   //处理替换以“>”结尾的img标签
   content = content.replace(img.get("img1"), "${imgreplace" + count + "}");
   map<string, object> header = new hashmap<string, object>();

   try {
      file filepath = new file(resourceutils.geturl("classpath:").getpath());
      string imagepath = filepath.getabsolutepath() + "\\static\\";
      imagepath += img.get("src").replaceall("/", "\\\\");
      //如果没有宽高属性，默认设置为400*300
      if(img.get("width") == null || img.get("height") == null) {
         header.put("width", 400);
         header.put("height", 300);
      }else {
         header.put("width", (int) (double.parsedouble(img.get("width"))));
         header.put("height", (int) (double.parsedouble(img.get("height"))));
      }
      header.put("type", "jpg");
      header.put("content", officeutil.inputstream2bytearray(new fileinputstream(imagepath), true));
   } catch (filenotfoundexception e) {
      e.printstacktrace();
   }
   param.put("${imgreplace" + count + "}", header);
   }
   try {
   // 生成doc格式的word文档，需要手动改为docx
   byte by[] = content.getbytes("utf-8");
   bytearrayinputstream bais = new bytearrayinputstream(by);
   poifsfilesystem poifs = new poifsfilesystem();
   directoryentry directory = poifs.getroot();
   documententry documententry = directory.createdocument("worddocument", bais);
   fileoutputstream ostream = new fileoutputstream("d:\\wordfile\\temp.doc");
   poifs.writefilesystem(ostream);
   bais.close();
   ostream.close();

   // 临时文件（手动改好的docx文件）
   customxwpfdocument doc = officeutil.generateword(param, "d:\\wordfile\\temp.docx");
   //最终生成的带图片的word文件
   fileoutputstream fopts = new fileoutputstream("d:\\wordfile\\final.docx");
   doc.write(fopts);
   fopts.close();
   } catch (exception e) {
   e.printstacktrace();
   }

}
return "d:/wordfile/final.docx";
}

//获取html中的图片元素信息
/**
 * Collects information about every {@code <img>} element of an HTML string.
 *
 * <p>Each returned map contains:
 * <ul>
 *   <li>{@code "img1"} - the tag as serialized by jsoup;</li>
 *   <li>{@code "img"} - the same tag with its last character replaced by {@code "/>"};</li>
 *   <li>{@code "src"} - the value of the src attribute;</li>
 *   <li>{@code "width"} / {@code "height"} - the numeric attribute value without a
 *       trailing {@code px}; absent when the attribute is missing or not a plain number.</li>
 * </ul>
 *
 * @param htmlstr HTML to inspect
 * @return one map per image, in document order
 */
public static List<HashMap<String, String>> getimgstr(String htmlstr) {
    List<HashMap<String, String>> pics = new ArrayList<HashMap<String, String>>();

    for (Element img : Jsoup.parse(htmlstr).select("img")) {
        HashMap<String, String> map = new HashMap<String, String>();
        putpixelsize(map, "width", img.attr("width"));
        putpixelsize(map, "height", img.attr("height"));

        String tag = img.toString();
        map.put("img", tag.substring(0, tag.length() - 1) + "/>");
        map.put("img1", tag);
        map.put("src", img.attr("src"));
        pics.add(map);
    }
    return pics;
}

/**
 * Stores a size attribute under {@code key} when it is a plain number, dropping an optional
 * trailing "px". The original cut the last two characters unconditionally, which turned
 * "400" into "4" and threw on one-character values.
 */
private static void putpixelsize(Map<String, String> target, String key, String raw) {
    String value = raw.trim();
    if (value.toLowerCase(Locale.ROOT).endsWith("px")) {
        value = value.substring(0, value.length() - 2).trim();
    }
    if (value.matches("\\d+(\\.\\d+)?")) {
        target.put(key, value);
    }
}
officeutil工具类，之前发现网上的写法只支持一张图片的修改，多张图片就会报错，是因为添加了图片，processparagraphs方法中的runs的大小改变了，会报arraylist的异常，就和我们循环list中删除元素会报异常道理一样，解决方法就是复制一个新的arraylist进行循环即可：

package com.example.demo.util;

import java.io.bytearrayinputstream;
import java.io.fileinputstream;
import java.io.ioexception;
import java.io.inputstream;
import java.util.arraylist;
import java.util.iterator;
import java.util.list;
import java.util.map;
import java.util.map.entry;

import org.apache.poi.poixmldocument;
import org.apache.poi.hwpf.extractor.wordextractor;
import org.apache.poi.openxml4j.opc.opcpackage;
import org.apache.poi.xwpf.usermodel.xwpfparagraph;
import org.apache.poi.xwpf.usermodel.xwpfrun;
import org.apache.poi.xwpf.usermodel.xwpftable;
import org.apache.poi.xwpf.usermodel.xwpftablecell;
import org.apache.poi.xwpf.usermodel.xwpftablerow;

/**
* 适用于word 2007
*/
public class officeutil {

/**
* 根据指定的参数值、模板，生成 word 文档
* @param param 需要替换的变量
* @param template 模板
*/
public static customxwpfdocument generateword(map<string, object> param, string template) {
customxwpfdocument doc = null;
try {
   opcpackage pack = poixmldocument.openpackage(template);
   doc = new customxwpfdocument(pack);
   if (param != null && param.size() > 0) {

   //处理段落
   list<xwpfparagraph> paragraphlist = doc.getparagraphs();
   processparagraphs(paragraphlist, param, doc);

   //处理表格
   iterator<xwpftable> it = doc.gettablesiterator();
   while (it.hasnext()) {
      xwpftable table = it.next();
      list<xwpftablerow> rows = table.getrows();
      for (xwpftablerow row : rows) {
         list<xwpftablecell> cells = row.gettablecells();
         for (xwpftablecell cell : cells) {
         list<xwpfparagraph> paragraphlisttable = cell.getparagraphs();
         processparagraphs(paragraphlisttable, param, doc);
         }
      }
   }
   }
} catch (exception e) {
   e.printstacktrace();
}
return doc;
}
/**
* 处理段落
* @param paragraphlist
*/
public static void processparagraphs(list<xwpfparagraph> paragraphlist,map<string, object> param,customxwpfdocument doc){
if(paragraphlist != null && paragraphlist.size() > 0){
   for(xwpfparagraph paragraph:paragraphlist){
   //poi转换过来的行间距过大，需要手动调整
   if(paragraph.getspacingbefore() >= 1000 || paragraph.getspacingafter() > 1000) {
      paragraph.setspacingbefore(0);
      paragraph.setspacingafter(0);
   }
   //设置word中左右间距
   paragraph.setindentationleft(0);
   paragraph.setindentationright(0);
   list<xwpfrun> runs = paragraph.getruns();
   //加了图片，修改了paragraph的runs的size，所以循环不能使用runs
   list<xwpfrun> allruns = new arraylist<xwpfrun>(runs);
   for (xwpfrun run : allruns) {
      string text = run.gettext(0);
      if(text != null){
         boolean issettext = false;
         for (entry<string, object> entry : param.entryset()) {
         string key = entry.getkey();
         if(text.indexof(key) != -1){
            issettext = true;
            object value = entry.getvalue();
            if (value instanceof string) {//文本替换
               text = text.replace(key, value.tostring());
            } else if (value instanceof map) {//图片替换
               text = text.replace(key, "");
               map pic = (map)value;
               int width = integer.parseint(pic.get("width").tostring());
               int height = integer.parseint(pic.get("height").tostring());
               int pictype = getpicturetype(pic.get("type").tostring());
               byte[] bytearray = (byte[]) pic.get("content");
               bytearrayinputstream byteinputstream = new bytearrayinputstream(bytearray);
               try {
               string blipid = doc.addpicturedata(byteinputstream,pictype);
               doc.createpicture(blipid,doc.getnextpicnamenumber(pictype), width, height,paragraph);
               } catch (exception e) {
               e.printstacktrace();
               }
            }
         }
         }
         if(issettext){
         run.settext(text,0);
         }
      }
   }
   }
}
}
/**
* 根据图片类型，取得对应的图片类型代码
* @param pictype
* @return int
*/
private static int getpicturetype(string pictype){
int res = customxwpfdocument.picture_type_pict;
if(pictype != null){
   if(pictype.equalsignorecase("png")){
   res = customxwpfdocument.picture_type_png;
   }else if(pictype.equalsignorecase("dib")){
   res = customxwpfdocument.picture_type_dib;
   }else if(pictype.equalsignorecase("emf")){
   res = customxwpfdocument.picture_type_emf;
   }else if(pictype.equalsignorecase("jpg") || pictype.equalsignorecase("jpeg")){
   res = customxwpfdocument.picture_type_jpeg;
   }else if(pictype.equalsignorecase("wmf")){
   res = customxwpfdocument.picture_type_wmf;
   }
}
return res;
}
/**
* 将输入流中的数据写入字节数组
* @param in
* @return
*/
public static byte[] inputstream2bytearray(inputstream in,boolean isclose){
byte[] bytearray = null;
try {
   int total = in.available();
   bytearray = new byte;
   in.read(bytearray);
} catch (ioexception e) {
   e.printstacktrace();
}finally{
   if(isclose){
   try {
      in.close();
   } catch (exception e2) {
      system.out.println("关闭流失败");
   }
   }
}
return bytearray;
}
}
我认为之所以word2003不支持图片替换，主要是处理2003版本的hwpfdocument对象被声明为了final，我们就无法重写它的方法了。而处理2007版本的类为xwpfdocument，是可以继承的，通过继承xwpfdocument，重写createpicture方法即可实现图片替换，以下为对应的customxwpfdocument类：

package com.example.demo.util;

import java.io.ioexception;
import java.io.inputstream;
import org.apache.poi.openxml4j.opc.opcpackage;
import org.apache.poi.xwpf.usermodel.xwpfdocument;
import org.apache.poi.xwpf.usermodel.xwpfparagraph;
import org.apache.xmlbeans.xmlexception;
import org.apache.xmlbeans.xmltoken;
import org.openxmlformats.schemas.drawingml.x2006.main.ctnonvisualdrawingprops;
import org.openxmlformats.schemas.drawingml.x2006.main.ctpositivesize2d;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingdrawing.ctinline;

/**
* 自定义 xwpfdocument，并重写 createpicture()方法
*/
public class customxwpfdocument extends xwpfdocument {
public customxwpfdocument(inputstream in) throws ioexception {
super(in);
}

public customxwpfdocument() {
super();
}

public customxwpfdocument(opcpackage pkg) throws ioexception {
super(pkg);
}

/**
* @param ind
* @param width 宽
* @param height 高
* @param paragraph 段落
*/
public void createpicture(string blipid, int ind, int width, int height,xwpfparagraph paragraph) {
final int emu = 9525;
width *= emu;
height *= emu;
ctinline inline = paragraph.createrun().getctr().addnewdrawing().addnewinline();
string picxml = ""
   + "<a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">"
   + "<a:graphicdata uri=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">"
   + " <pic:pic xmlns:pic=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">"
   + " <pic:nvpicpr>" + "    <pic:cnvpr id=\""
   + ind
   + "\" name=\"generated\"/>"
   + "    <pic:cnvpicpr/>"
   + " </pic:nvpicpr>"
   + " <pic:blipfill>"
   + "    <a:blip r:embed=\""
   + blipid
   + "\" xmlns:r=\"http://schemas.openxmlformats.org/officedocument/2006/relationships\"/>"
   + "    <a:stretch>"
   + "    <a:fillrect/>"
   + "    </a:stretch>"
   + " </pic:blipfill>"
   + " <pic:sppr>"
   + "    <a:xfrm>"
   + "    <a:off x=\"0\" y=\"0\"/>"
   + "    <a:ext cx=\""
   + width
   + "\" cy=\""
   + height
   + "\"/>"
   + "    </a:xfrm>"
   + "    <a:prstgeom prst=\"rect\">"
   + "    <a:avlst/>"
   + "    </a:prstgeom>"
   + " </pic:sppr>"
   + " </pic:pic>"
   + "</a:graphicdata>" + "</a:graphic>";

inline.addnewgraphic().addnewgraphicdata();
xmltoken xmltoken = null;
try {
   xmltoken = xmltoken.factory.parse(picxml);
} catch (xmlexception xe) {
   xe.printstacktrace();
}
inline.set(xmltoken);

inline.setdistt(0);
inline.setdistb(0);
inline.setdistl(0);
inline.setdistr(0);

ctpositivesize2d extent = inline.addnewextent();
extent.setcx(width);
extent.setcy(height);

ctnonvisualdrawingprops docpr = inline.addnewdocpr();
docpr.setid(ind);
docpr.setname("图片" + ind);
docpr.setdescr("测试");
}
}
以上就是通过poi实现html和word的相互转换，对于html无法转换为可读的docx这个问题尚未解决，如果大家有好的解决方法可以交流一下。
原文链接：https://blog.csdn.net/j1231230/article/details/80712531

http://www.zzvips.com/article/172749.html

页: [1]

CodeAE代码之家-专为程序员打造的技术家园！-网站地图

java使用POI实现html和word相互转换