我有一个 PDF 文件，共 60 页。每一页上都有发票编号，其中有些编号是唯一的，有些是重复的。我使用的是 Apache PDFBox。
import java.io.*;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.util.*;
import java.util.regex.*;
public class PDFtest1 {
public static void main(String[] args){
PDDocument pd;
try {
File input = new File("G:\\Sales.pdf");
// StringBuilder to store the extracted text
StringBuilder sb = new StringBuilder();
pd = PDDocument.load(input);
PDFTextStripper stripper = new PDFTextStripper();
// Add text to the StringBuilder from the PDF
sb.append(stripper.getText(pd));
Pattern p = Pattern.compile("Invoice No.\\s\\w\\d\\d\\d\\d\\d\\d\\d\\d\\d\\d");
// Matcher refers to the actual text where the pattern will be found
Matcher m = p.matcher(sb); …Run Code Online (Sandbox Code Playgroud)