|
@@ -1,13 +1,92 @@
|
|
|
package cn.cslg.pas.service;
|
|
|
|
|
|
+import org.apache.pdfbox.cos.COSName;
|
|
|
+import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
+import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
+import org.apache.pdfbox.pdmodel.PDResources;
|
|
|
+import org.apache.pdfbox.pdmodel.common.PDStream;
|
|
|
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
|
|
+import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
|
|
|
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
|
|
+import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
+import org.junit.Test;
|
|
|
import org.springframework.boot.test.context.SpringBootTest;
|
|
|
|
|
|
+import javax.imageio.ImageIO;
|
|
|
+import java.io.File;
|
|
|
+import java.io.FileInputStream;
|
|
|
+import java.io.FileWriter;
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.List;
|
|
|
+
|
|
|
@SpringBootTest
|
|
|
public class PDFBoxTests {
|
|
|
+ @Test
|
|
|
public void test(){
|
|
|
+ File file = new File("D:\\PAS\\target\\file\\20230913\\0a01ae69c6b540789ae7848157433cf7.pdf");
|
|
|
+ FileInputStream in = null;
|
|
|
+ try {
|
|
|
+
|
|
|
+ // 获取解析后得到的PDF文档对象
|
|
|
+ PDDocument pdfdocument = PDDocument.load(file);
|
|
|
+
|
|
|
+ int pageNumbers=pdfdocument.getNumberOfPages();
|
|
|
+ System.out.println("PDF总页数:"+pageNumbers);
|
|
|
+ //新建一个PDF文本剥离器
|
|
|
+ PDFTextStripper stripper = new PDFTextStripper();
|
|
|
+ //sort设置为true 则按照行进行读取,默认是false
|
|
|
+ stripper.setSortByPosition(true);
|
|
|
+
|
|
|
+
|
|
|
+ for(int i=1;i<=pdfdocument.getNumberOfPages();i++)
|
|
|
+ {
|
|
|
+ //读取文字
|
|
|
+ System.out.println("第 " + i + " 页 " );
|
|
|
+ stripper.setStartPage(i);
|
|
|
+ stripper.setEndPage(i);
|
|
|
+ System.out.println("PDF文件的文本内容如下:");
|
|
|
+
|
|
|
|
|
|
+ //读取图片
|
|
|
+ PDPage page=pdfdocument.getPage(i-1);
|
|
|
+ PDResources pdResources = page.getResources();
|
|
|
+ int index=1;
|
|
|
+ //获取页中的对象
|
|
|
+ System.out.println("PDF文件的图片:");
|
|
|
+ Iterable<COSName> cosNames =pdResources.getXObjectNames();
|
|
|
+ for(COSName csName : cosNames)
|
|
|
+ {
|
|
|
+ PDXObject pdxObject = pdResources.getXObject(csName);
|
|
|
+ if(pdxObject instanceof PDImageXObject){
|
|
|
+ PDStream pdStream = pdxObject.getStream();
|
|
|
+ PDImageXObject image = new PDImageXObject(pdStream, pdResources);
|
|
|
+ File imgFile = new File(String.format("D:\\PAS\\target\\file\\2023"+File.separator+"page%d-image%d.jpeg", i,index++));
|
|
|
+ ImageIO.write(image.getImage(), "jpeg", imgFile);
|
|
|
+ }
|
|
|
+ else if(pdxObject instanceof PDFormXObject)
|
|
|
+ {
|
|
|
+ ((PDFormXObject) pdxObject).getResources();
|
|
|
+ PDStream pdStream = pdxObject.getStream();
|
|
|
+ PDImageXObject image = new PDImageXObject(pdStream, pdResources);
|
|
|
+ File imgFile = new File(String.format("D:\\PAS\\target\\file\\2023"+File.separator+"page%d-image%d.jpeg", i,index++));
|
|
|
+ ImageIO.write(image.getImage(), "jpeg", imgFile);
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
+ }
|
|
|
+ }
|
|
|
+ catch (Exception e) {
|
|
|
+ System.out.println("读取PDF文件" + file.getAbsolutePath() + "生失败!" + e);
|
|
|
+ e.printStackTrace();
|
|
|
+ } finally {
|
|
|
+ if (in != null) {
|
|
|
+ try {
|
|
|
+ in.close();
|
|
|
+ } catch (IOException e1) {
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
|
|
|
+ }
|
|
|
}
|
|
|
}
|