关于docx,html,xhtml,pdf直接转换
近期由于项目的原因,接触到word转html、pdf转html以及之后在线编辑的模块。在网上找了许多资料,经过整理测试,已初具规模。
首先是doc(docx)在线编辑:
1 推荐使用:zohowriter,无插件的web word编辑器
2 推荐使用:docx4J,可以先把docx文档转换为html:
package com.zoma.common;import java.io.BufferedReader;import java.io.ByteArrayInputStream;import java.io.ByteArrayOutputStream;import java.io.DataOutputStream;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.FileReader;import java.io.FileWriter;import java.io.IOException;import java.io.OutputStreamWriter;import java.io.PrintWriter;
import org.docx4j.XmlUtils;import org.docx4j.convert.in.xhtml.XHTMLImporter;import org.docx4j.convert.out.html.HTMLConversionImageHandler;import org.docx4j.convert.out.html.HtmlExporterNonXSLT;import org.docx4j.openpackaging.exceptions.Docx4JException;import org.docx4j.openpackaging.packages.WordprocessingMLPackage;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import org.w3c.tidy.Tidy;
public class DocxUtil {
/ docx文档转换为html @param filepath --docx 文件路径(f:/1.docx) * @param outpath--生成html路径(f:1.html) @param imgpath--图片保存路径(f/img) @param imguri--图片引用(img/) * @return 转换成功返回true,失败返回false */public static boolean docToHtml(String filepath,String outpath,String imgpath,String imguri){boolean bo = false ;FileWriter fw = null;try {File infile = new File(filepath);File outfile = new File(outpath);WordprocessingMLPackage wmp=WordprocessingMLPackage.load(infile);HtmlExporterNonXSLT hn=new HtmlExporterNonXSLT(wmp, new HTMLConversionImageHandler(imgpath,imguri, true));String html=(XmlUtils.w3CDomNodeToString(hn.export()));fw=new FileWriter(outfile);fw.write(html);} catch (Exception e) {
e.printStackTrace();return bo ;}finally{try {fw.close();} catch (IOException e) {e.printStackTrace();}System.gc();}return bo ;}/** html转换为xhtml @param f_in --docx 文件路径(f:/1.html) * @param outfile--生成xhtml路径(f:1.xhtml) @return 转换成功返回true,失败返回false /public static boolean parseToXhtml(String f_in,String outfile){boolean bo = false; //BufferedInputStream sourceIn; //输入流 ByteArrayOutputStream tidyOutStream = null; //输出流 FileInputStreamfis= null; ByteArrayOutputStreambos= null; ByteArrayInputStream stream = null; DataOutputStreamto = null; try { // Reader reader;
fis = newFileInputStream(f_in); //读文件 bos =newByteArrayOutputStream(); intch; while((ch=fis.read())!=-1) { bos.write(ch); } byte[]bs=bos.toByteArray(); bos.close(); String hope_gb2312=new String(bs,"UTF-8");//注意,默认是GB2312,所以这里先转化成GB2312然后再转化成其他的。 byte[] hope_b=hope_gb2312.getBytes(); String basil=new String(hope_b,"UTF-8");//将GB2312转化成 UTF-8 // byte[] basil_b=basil.getBytes(); stream= new ByteArrayInputStream(basil.getBytes()); tidyOutStream = new ByteArrayOutputStream(); Tidy tidy = new Tidy(); tidy.setInputEncoding("UTF-8"); tidy.setQuiet(true); tidy.setOutputEncoding("UTF-8"); tidy.setShowWarnings(true); //不显示警告信息 tidy.setIndentContent(true);// tidy.setSmartIndent(true); tidy.setIndentAttributes(false); tidy.setWraplen(1024); //多长换行 //输出为xhtml tidy.setXHTML(true); tidy.setErrout(new PrintWriter(System.out)); tidy.parse(stream, tidyOutStream); to=newDataOutputStream(newFileOutputStream(outfile)); //将生成的xhtml写入 tidyOutStream.writeTo(to); System.out.println(tidyOutStream.toString()); bo = true ;
} catch ( Exception ex ) { System.out.println( ex.toString()); ex.printStackTrace(); return bo ; }finally{try {if(to!=null){to.close();}if(stream !=null){stream.close();}if(fis !=null){fis.close();}if(bos !=null){bos.close();}if(tidyOutStream !=null){tidyOutStream.close();}} catch (IOException e) {e.printStackTrace();}System.gc(); }
return bo;
}/** 获取html内容 @param filepath 文件路径 (f:/1.xhtml) * @param exps 搜索表达式(html元素标签等) @return 搜索内容 /public static String getHtmlStyle(String filepath,String exps){String str="";try {File input = new File(filepath);Document doc = Jsoup.parse(input, "UTF-8");Elements els = null; if(exps.equals("body")){els= doc.body().children();}else{els= doc.select(exps);}for(Element el :els){str+=el;}} catch (IOException e) {e.printStackTrace();return str;}finally{System.gc();}return str;}/** 修改html内容 @param filepath * @param exps @param htmls @return */public static boolean modifyHtml(String filepath,String exps,String htmls){boolean bo = false ;try {File input = new File(filepath);Document doc = Jsoup.parse(input, "UTF-8");if(exps.equals("body")){Element el = doc.body();el.html("");//el.children().html(htmls);//el.html(htmls);}//Elements els = doc.select(exps);//els.html(htmls);bo = true;} catch (IOException e) {e.printStackTrace();return bo ;}return bo ;}public static boolean modifyBody(String infile,String content,String outfile){File file = new File(infile); BufferedReader reader = null; FileWriter writer =null; String fileStr="" ; try { System.out.println("以行为单位读取文件内容,一次读一整行:"); reader = new BufferedReader(new FileReader(file)); String tempString = null;
// 一次读入一行,直到读入null为文件结束 while ((tempString = reader.readLine()) != null) { // 显示行号 fileStr+=tempString; } String newStr = fileStr.substring(0,fileStr.indexOf("<body>")); newStr+=content; newStr+=fileStr.substring(fileStr.indexOf("</body>"),fileStr.length()); //打开一个写文件器,构造函数中的第二个参数true表示以追加形式写文件 writer = new FileWriter(outfile, true); writer.write(newStr);
} catch (IOException e) { e.printStackTrace(); } finally {try {if(writer !=null){writer.close();}} catch (IOException e) {e.printStackTrace();} if (reader != null) { try { reader.close(); } catch (IOException e1) { } }
System.gc(); }return false ;}public static boolean modifyHead(String infile,String content,String outfile){File file = new File(infile); BufferedReader reader = null; String fileStr="" ; try { System.out.println("以行为单位读取文件内容,一次读一整行:"); reader = new BufferedReader(new FileReader(file)); String tempString = null;
// 一次读入一行,直到读入null为文件结束 while ((tempString = reader.readLine()) != null) { // 显示行号 fileStr+=tempString; } reader.close(); String newStr = fileStr.substring(0,fileStr.indexOf("<head>")); newStr+="<head>"; newStr+="<meta http-equiv='Content-Type' content='text/html; charset=UTF-8' />"; newStr+=content; newStr+=fileStr.substring(fileStr.indexOf("</head>"),fileStr.length()); //打开一个写文件器,构造函数中的第二个参数true表示以追加形式写文件 FileWriter writer = new FileWriter(outfile, true); writer.write(newStr); writer.close(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e1) { } } }return false ;}/** xhtml转换为docx文档 @param infile xhtml路径(f:/1.xhtml) * @param outfile docx生成路径(f:/1.docx) @return /public static boolean xhtmlToDocx(String infile,String outfile){boolean bo = false;try {WordprocessingMLPackage wxm=WordprocessingMLPackage.createPackage();wxm.getMainDocumentPart().getContent().addAll(XHTMLImporter.convert(new File(infile),null, wxm));wxm.save(new File(outfile));} catch (Docx4JException e) {e.printStackTrace();return bo ;} finally{System.gc();}return bo ;}}3 pdf 转html 推荐pdf2htmlex 高保真转化需要ubutun 12.04以上版本并且安装一下软件3.1 apt-get install python-software-properties3.2 sudo add-apt-repository ppa:coolwanglu/pdf2htmlex3.3 sudo apt-get update3.4 sudo apt-get install fontforge 3.5 sudo aptitude install poppler-utils 3.6 sudo apt-get install pdf2htmlex测试输入:pdf2htmlEX --zoom 1.3 /home/1.pdf --dest-dir /home/1会在home/1文件夹下生成html文件4 pdf合并 使用pdfbox类库/ @param savepath 原来文件夹路径 * @param filePath合并后名字,临时PDF文件夹 生成新的pdf文件后,删除原有pdf @return * @throws COSVisitorException @throws IOException /public static String mergePdfFiles(String savepath, String filePath) throws COSVisitorException, IOException { PDFMergerUtility mergePdf = new PDFMergerUtility(); List list = new ArrayList(); File dir = new File(savepath); System.out.println("------------merge savepath--"+savepath); System.out.println("------------merge dir--"+dir.getAbsolutePath()); 
System.out.println("------------merge to file--"+filePath); File file[] = dir.listFiles(); for (int i = 0; i < file.length; i++) { if (file.isFile()) {list.add(file); }
} System.out.println("--------------file--------list----------------------------"+list.size()); for(int i=0;i<list.size();i++) {File f = (File) list.get(i);InputStream is= new FileInputStream(f);mergePdf.addSource(is); }
mergePdf.setDestinationFileName(filePath);
mergePdf.mergeDocuments();
for(int i=0;i<list.size();i++) {File f = (File) list.get(i);f.deleteOnExit(); }
return filePath;}5 jpg图片合并转pdf首先ubutun安装convert软件apt-get install p_w_picpathmagickapt-get install graphicsmagick-p_w_picpathmagick-compat使用命令convert /usr/*.jpg /usr/1.pdf
6 Java程序调用Linux命令:Process proc = Runtime.getRuntime().exec("");
页:
[1]