Browse Source

3/12 从pdf抓取图片

lwhhszx 1 year ago
parent
commit
35ce28adb7

+ 93 - 0
src/main/java/cn/cslg/pas/service/common/PdfBoxService.java

@@ -0,0 +1,93 @@
+package cn.cslg.pas.service.common;
+
+
+import cn.cslg.pas.common.utils.FileUtils;
+import cn.cslg.pas.common.utils.UploadPatentBatchUtil;
+import cn.cslg.pas.common.vo.UploadSettingVO;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+import org.junit.Test;
+import org.springframework.stereotype.Service;
+
+import javax.imageio.ImageIO;
+import java.awt.Color;
+import java.awt.Graphics2D;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
+import java.util.List;
+
+/**
+ * pdfbox相关接口
+ *
+ * @Author 李仁杰
+ * @Date 2024/3/12
+ */
+@Service
+public class PdfBoxService {
+    private Integer numM = 0;
+    public void getPictrueFromPDF() {
+        File file = new File("D:\\PAS\\target\\file\\2023\\aaa.pdf");
+        FileInputStream in = null;
+        try {
+            PDDocument pdfdocument = PDDocument.load(file);
+            for (int i = 2; i <= pdfdocument.getNumberOfPages(); i++) {
+                PDPage page = pdfdocument.getPage(i - 1);
+                PDResources pdResources = page.getResources();
+                List<PDResources> resources = new ArrayList<>();
+                resources.add(pdResources);
+                while (resources.size() > 0) {
+                    PDResources resource = resources.remove(0);
+                    List<PDResources> pdResources1 = getPictureFromResources(resource);
+                    if (pdResources1 != null && pdResources1.size() > 0) {
+                        resources.addAll(pdResources1);
+                    }
+                }
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        } finally {
+            if (in != null) {
+                try {
+                    in.close();
+                } catch (IOException e1) {
+                }
+            }
+        }
+    }
+
+    public List<PDResources> getPictureFromResources(PDResources resources) throws IOException {
+        Boolean flag = true;
+        List<PDResources> resourceses = new ArrayList<>();
+        Iterable<COSName> cosNames = resources.getXObjectNames();
+        List<PDXObject> pdxObjects = new ArrayList<>();
+        for (COSName csName : cosNames) {
+            PDXObject pdxObject = resources.getXObject(csName);
+            if (pdxObject instanceof PDImageXObject) {
+                pdxObjects.add(pdxObject);
+            } else if (pdxObject instanceof PDFormXObject) {
+                PDFormXObject pdFormXObject = (PDFormXObject) pdxObject;
+                resourceses.add(pdFormXObject.getResources());
+                flag = false;
+            }
+
+        }
+        if(flag&&pdxObjects.size()>0){
+            this.addPdfImageTO(pdxObjects);
+        }
+        return resourceses;
+    }
+
+    public void addPdfImageTO(List<PDXObject> pdxObjects) throws IOException {
+        for (PDXObject pdxObject : pdxObjects) {
+            PDImageXObject pdImageXObject = (PDImageXObject) pdxObject;
+            File imgFile = new File(String.format("D:\\PAS\\target\\file\\2023" + File.separator + "page-image" + numM++ + ".jpg"));
+            ImageIO.write(pdImageXObject.getImage(), "jpg", imgFile);
+        }
+    }
+
+}

+ 48 - 50
src/test/java/cn/cslg/pas/service/PDFBoxTests.java

@@ -1,6 +1,10 @@
 package cn.cslg.pas.service;
 
-import org.apache.pdfbox.cos.COSName;
+import com.spire.pdf.PdfDocument;
+import com.spire.pdf.PdfPageBase;
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.cos.*;
+import org.apache.pdfbox.pdfparser.PDFStreamParser;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
@@ -15,68 +19,34 @@ import org.springframework.boot.test.context.SpringBootTest;
 import javax.imageio.ImageIO;
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.FileWriter;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
 
 @SpringBootTest
 public class PDFBoxTests {
+    private Integer numM = 0;
+
     @Test
-    public  void test(){
-        File file = new File("D:\\PAS\\target\\file\\20230913\\0a01ae69c6b540789ae7848157433cf7.pdf");
+    public void getPictrueFromPDF() {
+        File file = new File("D:\\PAS\\target\\file\\2023\\aaa.pdf");
         FileInputStream in = null;
         try {
-
-            // 获取解析后得到的PDF文档对象
             PDDocument pdfdocument = PDDocument.load(file);
-
-            int pageNumbers=pdfdocument.getNumberOfPages();
-            System.out.println("PDF总页数:"+pageNumbers);
-            //新建一个PDF文本剥离器
-            PDFTextStripper stripper = new PDFTextStripper();
-            //sort设置为true 则按照行进行读取,默认是false
-            stripper.setSortByPosition(true);
-
-
-            for(int i=1;i<=pdfdocument.getNumberOfPages();i++)
-            {
-                //读取文字
-                System.out.println("第 " + i + " 页 " );
-                stripper.setStartPage(i);
-                stripper.setEndPage(i);
-                System.out.println("PDF文件的文本内容如下:");
-
-
-                //读取图片
-                PDPage page=pdfdocument.getPage(i-1);
+            for (int i = 2; i <= pdfdocument.getNumberOfPages(); i++) {
+                PDPage page = pdfdocument.getPage(i - 1);
                 PDResources pdResources = page.getResources();
-                int index=1;
-                //获取页中的对象
-                System.out.println("PDF文件的图片:");
-                Iterable<COSName> cosNames =pdResources.getXObjectNames();
-                for(COSName csName : cosNames)
-                {
-                    PDXObject pdxObject = pdResources.getXObject(csName);
-                    if(pdxObject instanceof PDImageXObject){
-                        PDStream pdStream = pdxObject.getStream();
-                        PDImageXObject image = new PDImageXObject(pdStream, pdResources);
-                        File imgFile = new File(String.format("D:\\PAS\\target\\file\\2023"+File.separator+"page%d-image%d.jpeg", i,index++));
-                        ImageIO.write(image.getImage(), "jpeg", imgFile);
-                    }
-                    else if(pdxObject instanceof PDFormXObject)
-                    {
-                        ((PDFormXObject) pdxObject).getResources();
-                        PDStream pdStream = pdxObject.getStream();
-                        PDImageXObject image = new PDImageXObject(pdStream, pdResources);
-                        File imgFile = new File(String.format("D:\\PAS\\target\\file\\2023"+File.separator+"page%d-image%d.jpeg", i,index++));
-                        ImageIO.write(image.getImage(), "jpeg", imgFile);
+                List<PDResources> resources = new ArrayList<>();
+                resources.add(pdResources);
+                while (resources.size() > 0) {
+                    PDResources resource = resources.remove(0);
+                    List<PDResources> pdResources1 = getPictureFromResources(resource);
+                    if (pdResources1 != null && pdResources1.size() > 0) {
+                        resources.addAll(pdResources1);
                     }
                 }
-
             }
-        }
-        catch (Exception e) {
-            System.out.println("读取PDF文件" + file.getAbsolutePath() + "生失败!" + e);
+        } catch (Exception e) {
             e.printStackTrace();
         } finally {
             if (in != null) {
@@ -85,8 +55,36 @@ public class PDFBoxTests {
                 } catch (IOException e1) {
                 }
             }
+        }
+    }
+
+    public List<PDResources> getPictureFromResources(PDResources resources) throws IOException {
+        Boolean flag = true;
+        List<PDResources> resourceses = new ArrayList<>();
+        Iterable<COSName> cosNames = resources.getXObjectNames();
+        List<PDXObject> pdxObjects = new ArrayList<>();
+        for (COSName csName : cosNames) {
+            PDXObject pdxObject = resources.getXObject(csName);
+            if (pdxObject instanceof PDImageXObject) {
+                pdxObjects.add(pdxObject);
+            } else if (pdxObject instanceof PDFormXObject) {
+                PDFormXObject pdFormXObject = (PDFormXObject) pdxObject;
+                resourceses.add(pdFormXObject.getResources());
+                flag = false;
+            }
 
+        }
+        if(flag&&pdxObjects.size()>0){
+            this.addPdfImageTO(pdxObjects);
+        }
+        return resourceses;
+    }
 
+    /**
+     * Writes every image XObject of the list to {@code D:\PAS\target\file\2023} as a JPEG file
+     * named {@code page-image<N>.jpg}, where {@code N} is the running {@code numM} counter.
+     *
+     * @param pdxObjects the XObjects to export; each entry is cast to {@code PDImageXObject}
+     * @throws IOException if an image cannot be decoded or written
+     */
+    public void addPdfImageTO(List<PDXObject> pdxObjects) throws IOException {
+        for (PDXObject pdxObject : pdxObjects) {
+            PDImageXObject pdImageXObject = (PDImageXObject) pdxObject;
+            File imgFile = new File("D:\\PAS\\target\\file\\2023" + File.separator + "page-image" + numM++ + ".jpg");
+            // ImageIO.write returns false, without throwing, when no writer accepts the image;
+            // report it so a missing output file is not a silent surprise.
+            if (!ImageIO.write(pdImageXObject.getImage(), "jpg", imgFile)) {
+                System.err.println("No ImageIO writer could encode " + imgFile + " as jpg");
+            }
         }
     }
 }