我想从PDF文件中提取图像.我尝试使用以下代码,它从PDF中完美地提取了一个jpeg图像.问题是如何从特定页面(例如第1页)或从其他页面中提取图像.我不想阅读整个PDF来搜索图像.
有什么建议?
提取图像的代码:
private void List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
{
List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();
iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null;
iTextSharp.text.pdf.PdfReader PDFReaderObj = null;
iTextSharp.text.pdf.PdfObject PDFObj = null;
iTextSharp.text.pdf.PdfStream PDFStremObj = null;
try
{
RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(PDFSourcePath);
PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);
for (int i = 0; i <= PDFReaderObj.XrefSize - 1; i++)
{
PDFObj = PDFReaderObj.GetPdfObject(i);
if ((PDFObj != null) && PDFObj.IsStream())
{
PDFStremObj = (iTextSharp.text.pdf.PdfStream)PDFObj;
iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE);
if ((subtype != null) && subtype.ToString() == iTextSharp.text.pdf.PdfName.IMAGE.ToString())
{
byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw((iTextSharp.text.pdf.PRStream)PDFStremObj);
if ((bytes != null))
{
try
{
System.IO.MemoryStream MS = new System.IO.MemoryStream(bytes);
MS.Position = 0;
System.Drawing.Image ImgPDF = System.Drawing.Image.FromStream(MS);
pictureBox1.Image = ImgPDF;
MS.Close();
MS.Flush();
}
catch (Exception)
{
}
}
}
}
}
PDFReaderObj.Close();
}
catch (Exception ex)
{
throw new Exception(ex.Message);
}
}
Run Code Online (Sandbox Code Playgroud)
我目前没有iTextSharp 4.0可用,所以这个代码的目标是5.2,但它对老版本也应该可以正常工作.此代码几乎直接来自此帖子,所以请查看该帖子以及进一步问题的回复.正如我在上面的评论中所说,您的代码正在查看文档透视图中的所有图像,而我链接的代码逐页进行.
请阅读所有的意见在其他职位,尤其是这一次它解释说,这仅适用于JPG图像.PDF支持很多不同类型的图像,除非你知道你只处理JPG,否则你需要添加更多代码.有关提示,请参阅此帖和此帖.
string testFile = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Doc1.pdf");
string outputPath = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
int pageNum = 1;
PdfReader pdf = new PdfReader(testFile);
PdfDictionary pg = pdf.GetPageN(pageNum);
PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
if (xobj == null) { return; }
foreach (PdfName name in xobj.Keys) {
PdfObject obj = xobj.Get(name);
if (!obj.IsIndirect()) { continue; }
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
PdfName type = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
if (!type.Equals(PdfName.IMAGE)) { continue; }
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if (bytes == null) { continue; }
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes)) {
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = Path.Combine(outputPath, String.Format(@"{0}.jpg", pageNum));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
var jpegEncoder = ImageCodecInfo.GetImageEncoders().ToList().Find(x => x.FormatID == ImageFormat.Jpeg.Guid);
img.Save(path, jpegEncoder, parms);
}
}
Run Code Online (Sandbox Code Playgroud)