关于docx，html，xhtml,pdf直接转换

江南才子

近期由于项目的原因，接触到 word 转 html、pdf 转 html 以及转换之后在线编辑的模块，在网上找了许多资料，经过整理测试，已初具规模。

首先是 doc(docx) 的在线编辑：

1 推荐使用：Zoho Writer，无插件的 web word 编辑器。

2 推荐使用：docx4j，可以先把 docx 文档转换为 html，代码如下：

package com.zoma.common;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;

import org.docx4j.XmlUtils;
import org.docx4j.convert.in.xhtml.XHTMLImporter;
import org.docx4j.convert.out.html.HTMLConversionImageHandler;
import org.docx4j.convert.out.html.HtmlExporterNonXSLT;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.w3c.tidy.Tidy;
public class DocxUtil {
/ docx文档转换为html @param filepath --docx 文件路径（f:/1.docx） * @param outpath--生成html路径(f:1.html) @param imgpath--图片保存路径（f/img） @param imguri--图片引用（img/） * @return 转换成功返回true,失败返回false */public static boolean docToHtml(String filepath,String outpath,String imgpath,String imguri){boolean bo = false ;FileWriter fw = null;try {File infile = new File(filepath);File outfile = new File(outpath);WordprocessingMLPackage wmp=WordprocessingMLPackage.load(infile);HtmlExporterNonXSLT hn=new HtmlExporterNonXSLT(wmp, new HTMLConversionImageHandler(imgpath,imguri, true));String html=(XmlUtils.w3CDomNodeToString(hn.export()));fw=new FileWriter(outfile);fw.write(html);} catch (Exception e) {
e.printStackTrace();return bo ;}finally{try {fw.close();} catch (IOException e) {e.printStackTrace();}System.gc();}return bo ;}/** html转换为xhtml @param f_in --docx 文件路径（f:/1.html） * @param outfile--生成xhtml路径(f:1.xhtml) @return 转换成功返回true,失败返回false /public static boolean parseToXhtml(String f_in,String outfile){boolean bo = false; //BufferedInputStream sourceIn; //输入流 ByteArrayOutputStream tidyOutStream = null; //输出流 FileInputStream  fis  = null; ByteArrayOutputStream  bos  = null; ByteArrayInputStream stream = null; DataOutputStream  to = null; try {          // Reader reader;
   fis = new  FileInputStream(f_in); //读文件    bos =  new  ByteArrayOutputStream();    int  ch;    while((ch=fis.read())!=-1)    {          bos.write(ch);    }    byte[]  bs  =  bos.toByteArray();    bos.close();    String hope_gb2312=new String(bs,"UTF-8");//注意，默认是GB2312，所以这里先转化成GB2312然后再转化成其他的。          byte[] hope_b=hope_gb2312.getBytes();    String basil=new String(hope_b,"UTF-8");//将GB2312转化成 UTF-8          // byte[] basil_b=basil.getBytes();          stream= new ByteArrayInputStream(basil.getBytes());    tidyOutStream = new ByteArrayOutputStream();    Tidy tidy = new Tidy();    tidy.setInputEncoding("UTF-8");    tidy.setQuiet(true);          tidy.setOutputEncoding("UTF-8");          tidy.setShowWarnings(true); //不显示警告信息    tidy.setIndentContent(true);//    tidy.setSmartIndent(true);    tidy.setIndentAttributes(false);    tidy.setWraplen(1024); //多长换行    //输出为xhtml    tidy.setXHTML(true);    tidy.setErrout(new PrintWriter(System.out));    tidy.parse(stream, tidyOutStream);    to=new  DataOutputStream(new  FileOutputStream(outfile)); //将生成的xhtml写入    tidyOutStream.writeTo(to);    System.out.println(tidyOutStream.toString());    bo = true ;
} catch ( Exception ex ) {    System.out.println( ex.toString());    ex.printStackTrace();    return bo ; }finally{try {if(to!=null){to.close();}if(stream !=null){stream.close();}if(fis !=null){fis.close();}if(bos !=null){bos.close();}if(tidyOutStream !=null){tidyOutStream.close();}} catch (IOException e) {e.printStackTrace();}System.gc(); }
return bo;
}/** 获取html内容 @param filepath 文件路径（f:/1.xhtml） * @param exps 搜索表达式(html元素标签等) @return 搜索内容 /public static String getHtmlStyle(String filepath,String exps){String str="";try {File input = new File(filepath);Document doc = Jsoup.parse(input, "UTF-8");Elements els = null; if(exps.equals("body")){els= doc.body().children();}else{els= doc.select(exps);}for(Element el :els){str+=el;}} catch (IOException e) {e.printStackTrace();return str;}finally{System.gc();}  return str;}/** 修改html内容 @param filepath * @param exps @param htmls @return */public static boolean modifyHtml(String filepath,String exps,String htmls){boolean bo = false ;try {File input = new File(filepath);Document doc = Jsoup.parse(input, "UTF-8");if(exps.equals("body")){Element el = doc.body();el.html("");//el.children().html(htmls);//el.html(htmls);}//Elements els = doc.select(exps);//els.html(htmls);bo = true;} catch (IOException e) {e.printStackTrace();return bo ;}return bo ;}public static boolean modifyBody(String infile,String content,String outfile){File file = new File(infile); BufferedReader reader = null; FileWriter writer =null; String fileStr="" ; try {    System.out.println("以行为单位读取文件内容，一次读一整行：");    reader = new BufferedReader(new FileReader(file));    String tempString = null;
   // 一次读入一行，直到读入null为文件结束    while ((tempString = reader.readLine()) != null) {       // 显示行号       fileStr+=tempString;    }    String newStr = fileStr.substring(0,fileStr.indexOf("<body>"));    newStr+=content;    newStr+=fileStr.substring(fileStr.indexOf("</body>"),fileStr.length());    //打开一个写文件器，构造函数中的第二个参数true表示以追加形式写文件    writer = new FileWriter(outfile, true);    writer.write(newStr);
} catch (IOException e) {    e.printStackTrace(); } finally {try {if(writer !=null){writer.close();}} catch (IOException e) {e.printStackTrace();}    if (reader != null) {       try {       reader.close();       } catch (IOException e1) {       }    }
   System.gc(); }return false ;}public static boolean modifyHead(String infile,String content,String outfile){File file = new File(infile); BufferedReader reader = null; String fileStr="" ; try {    System.out.println("以行为单位读取文件内容，一次读一整行：");    reader = new BufferedReader(new FileReader(file));    String tempString = null;
   // 一次读入一行，直到读入null为文件结束    while ((tempString = reader.readLine()) != null) {       // 显示行号       fileStr+=tempString;    }    reader.close();    String newStr = fileStr.substring(0,fileStr.indexOf("<head>"));    newStr+="<head>";    newStr+="<meta http-equiv='Content-Type' content='text/html; charset=UTF-8' />";    newStr+=content;    newStr+=fileStr.substring(fileStr.indexOf("</head>"),fileStr.length());    //打开一个写文件器，构造函数中的第二个参数true表示以追加形式写文件    FileWriter writer = new FileWriter(outfile, true);    writer.write(newStr);    writer.close(); } catch (IOException e) {    e.printStackTrace(); } finally {    if (reader != null) {       try {       reader.close();       } catch (IOException e1) {       }    } }return false ;}/** xhtml转换为docx文档 @param infile xhtml路径（f:/1.xhtml） * @param outfile docx生成路径(f:/1.docx) @return /public static boolean xhtmlToDocx(String infile,String outfile){boolean bo = false;try {WordprocessingMLPackage wxm=WordprocessingMLPackage.createPackage();wxm.getMainDocumentPart().getContent().addAll(XHTMLImporter.convert(new File(infile),null, wxm));wxm.save(new File(outfile));} catch (Docx4JException e) {e.printStackTrace();return bo ;} finally{System.gc();}return bo ;}}3 pdf 转html 推荐pdf2htmlex 高保真转化需要ubutun 12.04以上版本并且安装一下软件3.1 apt-get install python-software-properties3.2 sudo add-apt-repository ppa:coolwanglu/pdf2htmlex3.3 sudo apt-get update3.4 sudo apt-get install fontforge 3.5 sudo aptitude install poppler-utils 3.6 sudo apt-get install pdf2htmlex测试输入:pdf2htmlEX --zoom 1.3 /home/1.pdf --dest-dir /home/1会在home/1文件夹下生成html文件4 pdf合并使用pdfbox类库/    @param savepath 原来文件夹路径 * @param filePath合并后名字，临时PDF文件夹 生成新的pdf文件后，删除原有pdf @return * @throws COSVisitorException @throws IOException /  public static String mergePdfFiles(String savepath, String filePath) throws COSVisitorException, IOException { PDFMergerUtility mergePdf = new PDFMergerUtility(); List list = new ArrayList(); File dir = new File(savepath); System.out.println("------------merge 
savepath--"+savepath); System.out.println("------------merge dir--"+dir.getAbsolutePath()); System.out.println("------------merge to file--"+filePath); File file[] = dir.listFiles(); for (int i = 0; i < file.length; i++) {    if (file.isFile())    {list.add(file);    }
} System.out.println("--------------file--------list----------------------------"+list.size()); for(int i=0;i<list.size();i++) {File f = (File) list.get(i);InputStream is= new FileInputStream(f);mergePdf.addSource(is); }

mergePdf.setDestinationFileName(filePath);
mergePdf.mergeDocuments();
for(int i=0;i<list.size();i++) {File f = (File) list.get(i);f.deleteOnExit(); }
return filePath;  }5 jpg图片合并转pdf首先ubutun安装convert软件apt-get install p_w_picpathmagickapt-get install graphicsmagick-p_w_picpathmagick-compat使用命令convert /usr/*.jpg /usr/1.pdf

6 java 程序调用 linux 命令：Process proc = Runtime.getRuntime().exec("要执行的命令");

[办公软件] 关于docx，html，xhtml,pdf直接转换

相关帖子

教你使用HTML/CSS和Three.js的喷火龙小游戏

基于 HTML+CSS+JS 的石头剪刀布游戏

HTML/JS/PHP网站底部获取当前年份代码

html+Js获取当前时间代码

nginx启ssi方法，宝塔面板nginx开启ssi

常用HTML网页跳转的几种方法

Word页眉和空白页总删除不掉的解决方法

CSS3荧光灯文字闪烁动画特效

利用html+js触发弹窗事件

利用HTML、CSS和JS制作随机密码生成器

江南才子 LV4