|
@@ -1,6 +1,10 @@
|
|
|
package cn.cslg.pas.service;
|
|
|
|
|
|
-import org.apache.pdfbox.cos.COSName;
|
|
|
+import com.spire.pdf.PdfDocument;
|
|
|
+import com.spire.pdf.PdfPageBase;
|
|
|
+import org.apache.pdfbox.contentstream.operator.Operator;
|
|
|
+import org.apache.pdfbox.cos.*;
|
|
|
+import org.apache.pdfbox.pdfparser.PDFStreamParser;
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
import org.apache.pdfbox.pdmodel.PDResources;
|
|
@@ -15,68 +19,34 @@ import org.springframework.boot.test.context.SpringBootTest;
|
|
|
import javax.imageio.ImageIO;
|
|
|
import java.io.File;
|
|
|
import java.io.FileInputStream;
|
|
|
-import java.io.FileWriter;
|
|
|
import java.io.IOException;
|
|
|
+import java.util.ArrayList;
|
|
|
import java.util.List;
|
|
|
|
|
|
@SpringBootTest
|
|
|
public class PDFBoxTests {
|
|
|
+ private Integer numM = 0;
|
|
|
+
|
|
|
@Test
|
|
|
- public void test(){
|
|
|
- File file = new File("D:\\PAS\\target\\file\\20230913\\0a01ae69c6b540789ae7848157433cf7.pdf");
|
|
|
+ public void getPictrueFromPDF() {
|
|
|
+ File file = new File("D:\\PAS\\target\\file\\2023\\aaa.pdf");
|
|
|
FileInputStream in = null;
|
|
|
try {
|
|
|
-
|
|
|
- // 获取解析后得到的PDF文档对象
|
|
|
PDDocument pdfdocument = PDDocument.load(file);
|
|
|
-
|
|
|
- int pageNumbers=pdfdocument.getNumberOfPages();
|
|
|
- System.out.println("PDF总页数:"+pageNumbers);
|
|
|
- //新建一个PDF文本剥离器
|
|
|
- PDFTextStripper stripper = new PDFTextStripper();
|
|
|
- //sort设置为true 则按照行进行读取,默认是false
|
|
|
- stripper.setSortByPosition(true);
|
|
|
-
|
|
|
-
|
|
|
- for(int i=1;i<=pdfdocument.getNumberOfPages();i++)
|
|
|
- {
|
|
|
- //读取文字
|
|
|
- System.out.println("第 " + i + " 页 " );
|
|
|
- stripper.setStartPage(i);
|
|
|
- stripper.setEndPage(i);
|
|
|
- System.out.println("PDF文件的文本内容如下:");
|
|
|
-
|
|
|
-
|
|
|
- //读取图片
|
|
|
- PDPage page=pdfdocument.getPage(i-1);
|
|
|
+ for (int i = 2; i <= pdfdocument.getNumberOfPages(); i++) {
|
|
|
+ PDPage page = pdfdocument.getPage(i - 1);
|
|
|
PDResources pdResources = page.getResources();
|
|
|
- int index=1;
|
|
|
- //获取页中的对象
|
|
|
- System.out.println("PDF文件的图片:");
|
|
|
- Iterable<COSName> cosNames =pdResources.getXObjectNames();
|
|
|
- for(COSName csName : cosNames)
|
|
|
- {
|
|
|
- PDXObject pdxObject = pdResources.getXObject(csName);
|
|
|
- if(pdxObject instanceof PDImageXObject){
|
|
|
- PDStream pdStream = pdxObject.getStream();
|
|
|
- PDImageXObject image = new PDImageXObject(pdStream, pdResources);
|
|
|
- File imgFile = new File(String.format("D:\\PAS\\target\\file\\2023"+File.separator+"page%d-image%d.jpeg", i,index++));
|
|
|
- ImageIO.write(image.getImage(), "jpeg", imgFile);
|
|
|
- }
|
|
|
- else if(pdxObject instanceof PDFormXObject)
|
|
|
- {
|
|
|
- ((PDFormXObject) pdxObject).getResources();
|
|
|
- PDStream pdStream = pdxObject.getStream();
|
|
|
- PDImageXObject image = new PDImageXObject(pdStream, pdResources);
|
|
|
- File imgFile = new File(String.format("D:\\PAS\\target\\file\\2023"+File.separator+"page%d-image%d.jpeg", i,index++));
|
|
|
- ImageIO.write(image.getImage(), "jpeg", imgFile);
|
|
|
+ List<PDResources> resources = new ArrayList<>();
|
|
|
+ resources.add(pdResources);
|
|
|
+ while (resources.size() > 0) {
|
|
|
+ PDResources resource = resources.remove(0);
|
|
|
+ List<PDResources> pdResources1 = getPictureFromResources(resource);
|
|
|
+ if (pdResources1 != null && pdResources1.size() > 0) {
|
|
|
+ resources.addAll(pdResources1);
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
}
|
|
|
- }
|
|
|
- catch (Exception e) {
|
|
|
- System.out.println("读取PDF文件" + file.getAbsolutePath() + "生失败!" + e);
|
|
|
+ } catch (Exception e) {
|
|
|
e.printStackTrace();
|
|
|
} finally {
|
|
|
if (in != null) {
|
|
@@ -85,8 +55,36 @@ public class PDFBoxTests {
|
|
|
} catch (IOException e1) {
|
|
|
}
|
|
|
}
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ public List<PDResources> getPictureFromResources(PDResources resources) throws IOException {
|
|
|
+ Boolean flag = true;
|
|
|
+ List<PDResources> resourceses = new ArrayList<>();
|
|
|
+ Iterable<COSName> cosNames = resources.getXObjectNames();
|
|
|
+ List<PDXObject> pdxObjects = new ArrayList<>();
|
|
|
+ for (COSName csName : cosNames) {
|
|
|
+ PDXObject pdxObject = resources.getXObject(csName);
|
|
|
+ if (pdxObject instanceof PDImageXObject) {
|
|
|
+ pdxObjects.add(pdxObject);
|
|
|
+ } else if (pdxObject instanceof PDFormXObject) {
|
|
|
+ PDFormXObject pdFormXObject = (PDFormXObject) pdxObject;
|
|
|
+ resourceses.add(pdFormXObject.getResources());
|
|
|
+ flag = false;
|
|
|
+ }
|
|
|
|
|
|
+ }
|
|
|
+ if(flag&&pdxObjects.size()>0){
|
|
|
+ this.addPdfImageTO(pdxObjects);
|
|
|
+ }
|
|
|
+ return resourceses;
|
|
|
+ }
|
|
|
|
|
|
+ public void addPdfImageTO(List<PDXObject> pdxObjects) throws IOException {
|
|
|
+ for (PDXObject pdxObject : pdxObjects) {
|
|
|
+ PDImageXObject pdImageXObject = (PDImageXObject) pdxObject;
|
|
|
+ File imgFile = new File(String.format("D:\\PAS\\target\\file\\2023" + File.separator + "page-image" + numM++ + ".jpg"));
|
|
|
+ ImageIO.write(pdImageXObject.getImage(), "jpg", imgFile);
|
|
|
}
|
|
|
}
|
|
|
}
|