Red*_*Red 3 pdf fonts itextsharp pdf-extraction
我正在编写一个Web应用程序,它在PDF的每个页面的顶部提取一行.PDF来自产品的不同版本,可以通过许多PDF打印机,也可以使用不同的版本和不同的设置.
到目前为止,借助PDFSharp和iTextSharp,我已经设法让它适用于所有版本的PDF.我卡住的地方是带有CID字体(Identity-H)的文件.
我编写了一个部分解析器来查找字体表引用和文本块,但如何把这些内容转换为可读文本把我难住了.
有没有人有: - 一个解决CID字体的解析器(像这一个/sf/answers/121258581/); 或 - 一些示例代码,用于解析页面资源字典以查找页面字体并获取其ToUnicode流以帮助完成此示例(/sf/answers/283382991/)
我们必须使用iTextSharp 4.1来保留免费使用许可.
这是我的部分解析器.
/// <summary>
/// Partial parser for a content stream that uses CID (Identity-H) text.
/// It scans the raw bytes for hex strings delimited by '&lt;' and '&gt;',
/// groups their characters four at a time, and emits one output line per
/// hex string. Translating a 4-character code into a real character is
/// still a TODO: for now the last character of each group is emitted.
/// </summary>
/// <param name="input">Uncompressed content-stream bytes; may be null or empty.</param>
/// <returns>
/// One line (prefixed with <see cref="Environment.NewLine"/>) per hex string
/// found, or an empty string for null/empty input or on an unexpected error.
/// </returns>
public string ExtractTextFromCIDPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0) return "";
    try
    {
        // Holds the final result to be returned
        System.Text.StringBuilder result = new System.Text.StringBuilder();
        // Are we in a block of text or not
        bool blnInText = false;
        // Holds each line of text before it is written to the result
        string phrase = "";
        // Holds the 4-character hex codes as they are built
        string hexCode = "";
        // Are we in a font reference or not (much like a code block)
        bool blnInFontRef = false;
        // Holds the last font reference and therefore the CMAP table
        // to be used for any text found after it (not consumed yet, see TODO below)
        string currentFontRef = "";
        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];
            switch (c)
            {
                case '<':
                    blnInText = true;
                    break;
                case '>':
                    result.Append(Environment.NewLine).Append(phrase);
                    phrase = "";
                    blnInText = false;
                    break;
                case 'T':
                    // Look ahead one byte for the operator's second letter.
                    // A 'T' that is the very last byte has no second letter;
                    // reading past the end used to throw and, via the catch
                    // below, discard everything extracted so far.
                    if (i + 1 < input.Length)
                    {
                        char op = char.ToLowerInvariant((char)input[i + 1]);
                        if (op == 'f')
                        {
                            // Tf represents the start of a font table reference
                            blnInFontRef = true;
                            currentFontRef = "";
                        }
                        else if (op == 'd')
                        {
                            // Td represents the end of a font table reference or
                            // the start of a text block
                            blnInFontRef = false;
                        }
                    }
                    break;
                default:
                    if (blnInText)
                    {
                        // We are looking for 4-character blocks of hex characters.
                        // These will build up a number which refers to the index
                        // of the glyph in the CMAP table, which will give us the
                        // character
                        hexCode = hexCode + c;
                        if (hexCode.Length == 4)
                        {
                            // TODO - translate code to character
                            char translatedHexCode = c;
                            phrase = phrase + translatedHexCode;
                            // Blank it out ready for the next 4
                            hexCode = "";
                        }
                    }
                    else if (blnInFontRef)
                    {
                        currentFontRef = currentFontRef + c;
                    }
                    break;
            }
        }
        return result.ToString();
    }
    catch
    {
        // Best effort: callers treat an empty string as "nothing extracted".
        return "";
    }
}
Run Code Online (Sandbox Code Playgroud)
花了一段时间,但我终于写出了一些代码,可以从Identity-H编码的PDF中读取纯文本.我把它发布在这里是为了帮助别人,我也知道它还有改进的余地.例如,我没有处理字符映射(beginbfchar),而且我对范围(bfrange)的处理实际上并不是按范围进行的.我已经在这上面花了一个多星期,除非我们遇到其他类型的文件,否则无法再为投入更多时间找到理由.抱歉.
用法:
// Open the document through the helper (instead of PdfReader.Open directly).
PdfDocument inputDocument = PDFHelpers.Open(physcialFilePath, PdfDocumentOpenMode.Import);
foreach (PdfPage page in inputDocument.Pages)
{
    // Visit each content stream of the page in turn.
    for (Int32 index = 0; index < page.Contents.Elements.Count; index++)
    {
        PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream;
        // First try the plain "(text) Tj" extractor, with spaces stripped.
        String outputText = new PDFParser().ExtractTextFromPDFBytes(stream.Value).Replace(" ", String.Empty);
        // The plain extractor emits "\n\r" for Td/TD moves, so a result made
        // only of those markers means no literal text was found.
        if (outputText == "" || outputText.Replace("\n\r", "") == "")
        {
            // Identity-H encoded file: collect the page fonts and decode the
            // hex strings through their ToUnicode CMaps instead.
            string[] hierarchy = new string[] { "/Resources", "/Font", "/F*" };
            List<PdfItem> fonts = PDFHelpers.FindObjects(hierarchy, page, true);
            outputText = PDFHelpers.FromUnicode(stream, fonts);
        }
    }
}
Run Code Online (Sandbox Code Playgroud)
And the actual helper class, which I'll post in its entirety, because all of it is used in the example, and because I found so few complete examples myself when I was trying to solve this issue. The helper uses both PDFSharp and iTextSharp to be able to open PDFs pre- and post-1.5, ExtractTextFromPDFBytes to read in a standard PDF, and my FindObjects (to search the document tree and return objects) and FromUnicode, which takes the encoded text and a fonts collection and translates it.
using PdfSharp.Pdf;
using PdfSharp.Pdf.Content;
using PdfSharp.Pdf.Content.Objects;
using System;
using System.Collections.Generic;
using System.IO;
namespace PdfSharp.Pdf.IO
{
/// <summary>
/// uses itextsharp 4.1.6 to convert any pdf to 1.4 compatible pdf, called instead of PdfReader.open
/// </summary>
static public class PDFHelpers
{
/// <summary>
/// Opens the PDF at <paramref name="PdfPath"/> without a password by
/// forwarding to the (path, password, open mode) overload with a null password.
/// </summary>
/// <param name="PdfPath">Path of the PDF file to open.</param>
/// <param name="openmode">Open mode handed through unchanged.</param>
/// <returns>Whatever the forwarded overload returns.</returns>
public static PdfDocument Open(string PdfPath, PdfDocumentOpenMode openmode)
{
    const string noPassword = null;
    return Open(PdfPath, noPassword, openmode);
}
/// <summary>
/// Reads the whole file at <paramref name="PdfPath"/> into memory and opens it
/// through the byte-array overload. When the opened document reports an empty
/// <c>FullPath</c>, the document is re-opened in Modify mode, saved back over
/// <paramref name="PdfPath"/>, and then opened once more in the requested mode.
/// </summary>
/// <param name="PdfPath">Path of the PDF file to open (and possibly overwrite).</param>
/// <param name="password">Password forwarded to the byte-array overload; null for none.</param>
/// <param name="openmode">Open mode for the returned document.</param>
/// <returns>The opened document.</returns>
/// <exception cref="Exception">
/// Thrown when the process's private memory size (in Mb) is smaller than the file size (in Mb).
/// </exception>
/// <exception cref="EndOfStreamException">
/// Thrown when the file ends before the number of bytes reported by its length could be read.
/// </exception>
public static PdfDocument Open(string PdfPath, string password, PdfDocumentOpenMode openmode)
{
    byte[] fileArray;
    // The using block owns the stream; it is closed before the file is
    // overwritten further down, and on every exception path.
    using (FileStream fileStream = new FileStream(PdfPath, FileMode.Open, FileAccess.Read))
    {
        int len = (int)fileStream.Length;
        // TODO: Setting this byteArray causes the out of memory exception which is why we
        // have the 70mb limit. Solve this and we can increase the file size limit
        long availableMemory;
        using (System.Diagnostics.Process proc = System.Diagnostics.Process.GetCurrentProcess())
        {
            // Mb of RAM allocated to this process that cannot be shared with other processes.
            // NOTE(review): this is memory already committed to the process, not free
            // memory, so the guard below is only a rough heuristic - confirm intent.
            availableMemory = proc.PrivateMemorySize64 / 1024 / 1024;
        }
        if (availableMemory < (fileStream.Length / 1024 / 1024))
        {
            throw new Exception("The available memory " + availableMemory + "Mb is not enough to open, split and save a file of " + fileStream.Length / 1024 / 1024);
        }
        fileArray = new byte[len];
        // Stream.Read may return fewer bytes than requested, so keep reading
        // until the buffer is full.
        int offset = 0;
        while (offset < len)
        {
            int read = fileStream.Read(fileArray, offset, len - offset);
            if (read <= 0)
            {
                throw new EndOfStreamException("Unexpected end of file while reading " + PdfPath);
            }
            offset += read;
        }
    }
    // Forward the password: it used to be dropped here, so this overload
    // behaved exactly like the password-less one.
    PdfDocument result = Open(fileArray, password, openmode);
    if (result.FullPath == "")
    {
        // The file was converted to a v1.4 document and only exists as a document in memory
        // Save over the original file so other references to the file get the compatible version
        // TODO: It would be good if we could do this conversion without opening every document another 2 times
        PdfDocument tempResult = Open(fileArray, password, PdfDocumentOpenMode.Modify);
        // NOTE(review): this font object is not used by the save below; it is kept only
        // because creating it may have side effects inside iTextSharp - TODO confirm and remove.
        iTextSharp.text.pdf.BaseFont bfR = iTextSharp.text.pdf.BaseFont.CreateFont(Environment.GetEnvironmentVariable("SystemRoot") + "\\fonts\\arial.ttf", iTextSharp.text.pdf.BaseFont.IDENTITY_H, iTextSharp.text.pdf.BaseFont.EMBEDDED);
        bfR.Subset = false;
        tempResult.Save(PdfPath);
        tempResult.Close();
        tempResult.Dispose();
        result = Open(fileArray, password, openmode);
    }
    return result;
}
/// <summary>
/// Opens a PDF held in a byte array without a password: wraps the bytes in a
/// <see cref="MemoryStream"/> and forwards to the (stream, password, open mode)
/// overload with a null password.
/// </summary>
/// <param name="fileArray">Complete PDF file contents.</param>
/// <param name="openmode">Open mode handed through unchanged.</param>
/// <returns>Whatever the forwarded overload returns.</returns>
public static PdfDocument Open(byte[] fileArray, PdfDocumentOpenMode openmode)
{
    MemoryStream wrapped = new MemoryStream(fileArray);
    return Open(wrapped, null, openmode);
}
/// <summary>
/// Opens a PDF held in a byte array with an optional password: wraps the bytes
/// in a <see cref="MemoryStream"/> and forwards to the
/// (stream, password, open mode) overload.
/// </summary>
/// <param name="fileArray">Complete PDF file contents.</param>
/// <param name="password">Password handed through unchanged; may be null.</param>
/// <param name="openmode">Open mode handed through unchanged.</param>
/// <returns>Whatever the forwarded overload returns.</returns>
public static PdfDocument Open(byte[] fileArray, string password, PdfDocumentOpenMode openmode)
{
    MemoryStream wrapped = new MemoryStream(fileArray);
    return Open(wrapped, password, openmode);
}
/// <summary>
/// Opens a PDF from a memory stream without a password by forwarding to the
/// (stream, password, open mode) overload with a null password.
/// </summary>
/// <param name="sourceStream">Stream holding the PDF file contents.</param>
/// <param name="openmode">Open mode handed through unchanged.</param>
/// <returns>Whatever the forwarded overload returns.</returns>
public static PdfDocument Open(MemoryStream sourceStream, PdfDocumentOpenMode openmode)
{
    const string noPassword = null;
    return Open(sourceStream, noPassword, openmode);
}
/// <summary>
/// Opens a PDF from a memory stream with PdfSharp. If PdfSharp rejects the
/// file with a <c>PdfReaderException</c>, the stream is re-read with
/// iTextSharp, rewritten through a <c>PdfStamper</c> as a form-flattened
/// PDF 1.4 document into a new memory stream, and that copy is opened
/// with PdfSharp instead.
/// </summary>
/// <param name="sourceStream">Stream holding the PDF; its position is reset to 0 before each read.</param>
/// <param name="password">Password for the source document, or null for none.</param>
/// <param name="openmode">Open mode for the returned document.</param>
/// <returns>The opened document (original or converted copy).</returns>
public static PdfDocument Open(MemoryStream sourceStream, string password, PdfDocumentOpenMode openmode)
{
    sourceStream.Position = 0;
    try
    {
        // The earlier version went on to parse the file a second time with
        // iTextSharp and to collect its fonts, but never used the results; a
        // failure there threw away a document PdfSharp had already opened.
        return (password == null) ?
            PdfReader.Open(sourceStream, openmode) :
            PdfReader.Open(sourceStream, password, openmode);
    }
    catch (PdfSharp.Pdf.IO.PdfReaderException)
    {
        //workaround if pdfsharp doesn't support this pdf
        sourceStream.Position = 0;
        MemoryStream outputStream = new MemoryStream();
        iTextSharp.text.pdf.PdfReader reader = (password == null) ?
            new iTextSharp.text.pdf.PdfReader(sourceStream) :
            new iTextSharp.text.pdf.PdfReader(sourceStream, System.Text.ASCIIEncoding.ASCII.GetBytes(password));
        iTextSharp.text.pdf.PdfStamper pdfStamper = new iTextSharp.text.pdf.PdfStamper(reader, outputStream);
        pdfStamper.FormFlattening = true;
        pdfStamper.Writer.SetPdfVersion(iTextSharp.text.pdf.PdfWriter.PDF_VERSION_1_4);
        // Keep outputStream usable after the stamper is closed.
        pdfStamper.Writer.CloseStream = false;
        pdfStamper.Close();
        return PdfReader.Open(outputStream, openmode);
    }
}
/// <summary>
/// Entry point of the recursive search through a PDF object tree. Creates the
/// result list, starts the private recursive overload at level 0 and returns
/// whatever that overload collected.
/// </summary>
/// <param name="objectHierarchy">Names of the objects to look for; an element may contain a
/// wildcard, e.g. /F*. When followHierarchy is true the order is a top-down path below
/// startingObject.</param>
/// <param name="startingObject">The PDF object to start from, such as a document or a page.</param>
/// <param name="followHierarchy">If true, each recursion level is compared only with the
/// objectHierarchy entry at that level. If false, every key is compared with all entries.</param>
/// <returns>The list of matching items collected by the recursive overload.</returns>
public static List<PdfItem> FindObjects(string[] objectHierarchy, PdfItem startingObject, bool followHierarchy)
{
    const int rootLevel = 0;
    List<PdfItem> matches = new List<PdfItem>();
    FindObjects(objectHierarchy, startingObject, followHierarchy, ref matches, rootLevel);
    return matches;
}
/// <summary>
/// Recursive worker behind the public FindObjects. Walks the keys of
/// <paramref name="startingObject"/> (when it is a dictionary), resolves
/// indirect references on matching entries, records matches in
/// <paramref name="results"/> and descends into each matched entry.
/// Entries that do not match are not descended into.
/// </summary>
/// <param name="objectHierarchy">Key names or single-wildcard patterns to match.</param>
/// <param name="startingObject">Item to inspect; anything that is not a dictionary is ignored.</param>
/// <param name="followHierarchy">True to compare keys only against objectHierarchy[Level];
/// false to compare them against every entry.</param>
/// <param name="results">Receives matches: every match when followHierarchy is false,
/// otherwise only matches at the last hierarchy level.</param>
/// <param name="Level">Current recursion depth, starting at 0.</param>
private static void FindObjects(string[] objectHierarchy, PdfItem startingObject, bool followHierarchy, ref List<PdfItem> results, int Level)
{
    // A matched value can be an array, name, number or null; the old
    // unconditional cast threw InvalidCastException / NullReferenceException here.
    PdfDictionary dictionary = startingObject as PdfDictionary;
    if (dictionary == null)
    {
        return;
    }
    foreach (PdfName keyName in dictionary.Elements.KeyNames)
    {
        bool matchFound = false;
        if (!followHierarchy)
        {
            // We need to check all items for a match, not just the top one
            for (int i = 0; i < objectHierarchy.Length && !matchFound; i++)
            {
                matchFound = KeyMatchesPattern(keyName.Value, objectHierarchy[i]);
            }
        }
        else
        {
            // Check the item in the hierarchy at this level for a match
            matchFound = Level < objectHierarchy.Length &&
                KeyMatchesPattern(keyName.Value, objectHierarchy[Level]);
        }
        if (!matchFound)
        {
            System.Diagnostics.Debug.WriteLine("Level " + Level.ToString() + " - " + keyName.ToString() + " unmatched");
            continue;
        }
        PdfItem item = dictionary.Elements[keyName];
        PdfSharp.Pdf.Advanced.PdfReference reference = item as PdfSharp.Pdf.Advanced.PdfReference;
        if (reference != null)
        {
            // Follow the indirect reference to the object it points at.
            item = reference.Value;
        }
        System.Diagnostics.Debug.WriteLine("Level " + Level.ToString() + " - " + keyName.ToString() + " matched");
        // Either we are at the end of the hierarchy (this is the target), or we
        // are returning every matching object regardless of position.
        if (Level == objectHierarchy.Length - 1 || !followHierarchy)
        {
            results.Add(item);
        }
        // Call back to this function to search lower levels
        FindObjects(objectHierarchy, item, followHierarchy, ref results, Level + 1);
    }
    System.Diagnostics.Debug.WriteLine("Level " + (Level - 1).ToString());
}
/// <summary>
/// Tests a dictionary key against a pattern that is either an exact name or
/// contains one '*' wildcard standing for any run of characters.
/// </summary>
private static bool KeyMatchesPattern(string name, string pattern)
{
    if (name == pattern)
    {
        return true;
    }
    int star = pattern.IndexOf('*');
    if (star < 0)
    {
        return false;
    }
    // The prefix is everything before the '*'. The previous code took one
    // character too few, so "/F*" matched every key beginning with "/",
    // and a pattern starting with '*' threw ArgumentOutOfRangeException.
    string prefix = pattern.Substring(0, star);
    string suffix = pattern.Substring(star + 1);
    return name.Length >= prefix.Length + suffix.Length &&
        name.StartsWith(prefix, StringComparison.Ordinal) &&
        name.EndsWith(suffix, StringComparison.Ordinal);
}
/// <summary>
/// Translates the hex-encoded (CID) text of a content stream into readable
/// text using the bfrange section of each font's /ToUnicode CMap.
/// </summary>
/// <remarks>
/// Only the first beginbfrange...endbfrange section of each CMap is read, and
/// each row is treated as a single code-to-value pair (first bracketed value
/// maps to the last bracketed value); beginbfchar sections are not read.
/// NOTE(review): CMaps are stored under "/F" + the font's index in
/// <paramref name="PDFFonts"/>, while Tf operands are looked up by the name
/// written in the content stream, so a font is only selected when those two
/// happen to agree - confirm against the caller's font ordering.
/// </remarks>
/// <param name="unreadableText">The text stream that needs to be decoded</param>
/// <param name="PDFFonts">A List of PDFItems containing the /Font objects, each of which may
/// contain a /ToUnicode entry with a CMap</param>
/// <returns>
/// The decoded text. Codes with no usable mapping are left in the output as
/// "&lt;code&gt;". Returns an empty string when an argument is null or when
/// no font supplied a usable CMap.
/// </returns>
public static string FromUnicode(PdfDictionary.PdfStream unreadableText, List<PdfItem> PDFFonts)
{
    if (unreadableText == null || PDFFonts == null)
    {
        return "";
    }
    Dictionary<string, Dictionary<string, string>> fonts = new Dictionary<string, Dictionary<string, string>>();
    // Get the CMap from each font in the passed list and store them by font name
    for (int font = 0; font < PDFFonts.Count; font++)
    {
        PdfDictionary fontDictionary = PDFFonts[font] as PdfDictionary;
        if (fontDictionary == null)
        {
            continue;
        }
        foreach (PdfName keyName in fontDictionary.Elements.KeyNames)
        {
            if (keyName.Value != "/ToUnicode")
            {
                continue;
            }
            PdfItem item = fontDictionary.Elements[keyName];
            PdfSharp.Pdf.Advanced.PdfReference reference = item as PdfSharp.Pdf.Advanced.PdfReference;
            if (reference != null)
            {
                item = reference.Value;
            }
            PdfDictionary toUnicode = item as PdfDictionary;
            if (toUnicode != null && toUnicode.Stream != null)
            {
                Dictionary<string, string> map = ParseBfRangeSection(toUnicode.Stream.ToString());
                if (map != null)
                {
                    fonts["/F" + font.ToString()] = map;
                }
            }
            break;
        }
    }
    if (fonts.Count == 0)
    {
        // Nothing to translate with. Indexing "/F0" unconditionally used to
        // throw KeyNotFoundException in this situation.
        return "";
    }
    Dictionary<string, string> defaultFontRef;
    if (!fonts.TryGetValue("/F0", out defaultFontRef))
    {
        defaultFontRef = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
    }
    // Holds the final result to be returned
    System.Text.StringBuilder result = new System.Text.StringBuilder();
    // Break the input text into lines
    string[] lines = unreadableText.ToString().Split(new string[] { "\n" }, StringSplitOptions.RemoveEmptyEntries);
    // Holds the last font reference and therefore the CMAP table
    // to be used for any text found after it
    Dictionary<string, string> currentFontRef = defaultFontRef;
    // Are we in a block of text or not? They can break across lines so we need an identifier
    bool blnInText = false;
    for (int line = 0; line < lines.Length; line++)
    {
        string rawLine = lines[line].Trim();
        string thisLine = rawLine;
        if (thisLine == "q")
        {
            // I think this denotes the start of a text block, and where we need to reset to the default font
            currentFontRef = defaultFontRef;
        }
        else
        {
            int textStart = thisLine.IndexOf(" Td <", StringComparison.Ordinal);
            if (textStart != -1)
            {
                thisLine = thisLine.Substring(textStart + 5);
                blnInText = true;
            }
        }
        if (thisLine.EndsWith("Tf", StringComparison.Ordinal))
        {
            // This is a font assignment. Take note of this and use this font's ToUnicode map when we find text
            int firstSpace = thisLine.IndexOf(' ');
            Dictionary<string, string> selected;
            // A line with no space has no font name in front of the operator.
            if (firstSpace > 0 && fonts.TryGetValue(thisLine.Substring(0, firstSpace), out selected))
            {
                currentFontRef = selected;
            }
        }
        else if (thisLine.EndsWith("> Tj", StringComparison.Ordinal))
        {
            thisLine = thisLine.Substring(0, thisLine.IndexOf("> Tj", StringComparison.Ordinal));
        }
        if (blnInText)
        {
            // This is a text block: translate it four hex characters at a time
            result.Append(DecodeHexRun(thisLine, currentFontRef));
        }
        if (rawLine.EndsWith("> Tj", StringComparison.Ordinal))
        {
            blnInText = false;
            if (rawLine.IndexOf(" 0 Td <", StringComparison.Ordinal) == -1)
            {
                // The vertical coords have changed, so add a new line
                result.Append(Environment.NewLine);
            }
            else
            {
                result.Append(" ");
            }
        }
    }
    return result.ToString();
}
/// <summary>
/// Reads the first beginbfrange...endbfrange section of a CMap into a lookup
/// from source code (hex, case-insensitive) to target value (hex). Returns
/// null when the section markers are not found.
/// </summary>
private static Dictionary<string, string> ParseBfRangeSection(string cmap)
{
    const string beginMarker = "beginbfrange";
    const string endMarker = "endbfrange";
    int beginAt = cmap.IndexOf(beginMarker, StringComparison.Ordinal);
    if (beginAt <= 0)
    {
        return null;
    }
    string section = cmap.Substring(beginAt + beginMarker.Length);
    int endAt = section.IndexOf(endMarker, StringComparison.Ordinal);
    if (endAt <= 0)
    {
        return null;
    }
    section = section.Substring(0, endAt);
    Dictionary<string, string> map = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
    // Accept any line ending; splitting on "\r\n" alone left a CMap written
    // with bare "\n" as one single entry.
    string[] rows = section.Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
    foreach (string row in rows)
    {
        int firstOpen = row.IndexOf('<');
        int firstClose = firstOpen < 0 ? -1 : row.IndexOf('>', firstOpen);
        int lastOpen = row.LastIndexOf('<');
        int lastClose = row.LastIndexOf('>');
        if (firstOpen < 0 || firstClose < 0 || lastClose <= lastOpen)
        {
            // Not a "<code> ... <value>" row; skip it instead of aborting everything.
            continue;
        }
        string code = row.Substring(firstOpen + 1, firstClose - firstOpen - 1);
        string value = row.Substring(lastOpen + 1, lastClose - lastOpen - 1);
        if (!map.ContainsKey(code))
        {
            // First definition wins.
            map.Add(code, value);
        }
    }
    return map;
}
/// <summary>
/// Translates a run of hex characters, four at a time, through the given
/// lookup. Codes without a usable mapping are emitted as "&lt;code&gt;".
/// </summary>
private static string DecodeHexRun(string hexRun, Dictionary<string, string> map)
{
    System.Text.StringBuilder decoded = new System.Text.StringBuilder();
    foreach (string code in hexRun.SplitInParts(4))
    {
        string value;
        string text;
        if (map.TryGetValue(code, out value) && TryHexToText(value, out text))
        {
            decoded.Append(text);
        }
        else
        {
            decoded.Append('<').Append(code).Append('>');
        }
    }
    return decoded.ToString();
}
/// <summary>
/// Converts a hex string made of one or more 4-digit UTF-16 code units into
/// text. Returns false when the string is empty, not a multiple of four
/// characters long, or contains non-hex characters.
/// </summary>
private static bool TryHexToText(string hex, out string text)
{
    text = null;
    if (hex.Length == 0 || hex.Length % 4 != 0)
    {
        return false;
    }
    char[] units = new char[hex.Length / 4];
    for (int i = 0; i < units.Length; i++)
    {
        ushort unit;
        if (!ushort.TryParse(hex.Substring(i * 4, 4), System.Globalization.NumberStyles.AllowHexSpecifier, System.Globalization.CultureInfo.InvariantCulture, out unit))
        {
            return false;
        }
        units[i] = (char)unit;
    }
    text = new string(units);
    return true;
}
// Credit to http://stackoverflow.com/questions/4133377/
/// <summary>
/// Splits a string into consecutive pieces of <paramref name="partLength"/>
/// characters; the final piece holds whatever is left and may be shorter.
/// Arguments are validated immediately, not on first enumeration.
/// </summary>
/// <param name="s">The string to split.</param>
/// <param name="partLength">Length of each piece; must be positive.</param>
/// <returns>A lazily evaluated sequence of the pieces, in order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="partLength"/> is zero or negative.</exception>
private static IEnumerable<String> SplitInParts(this String s, Int32 partLength)
{
    // These checks live in a non-iterator method so that they run at the
    // call site; inside an iterator they would not run until the caller
    // starts enumerating.
    if (s == null)
        throw new ArgumentNullException("s");
    if (partLength <= 0)
        throw new ArgumentException("Part length has to be positive.", "partLength");
    return SplitInPartsIterator(s, partLength);
}
/// <summary>Iterator behind SplitInParts; expects already validated arguments.</summary>
private static IEnumerable<String> SplitInPartsIterator(String s, Int32 partLength)
{
    for (var i = 0; i < s.Length; i += partLength)
        yield return s.Substring(i, Math.Min(partLength, s.Length - i));
}
}
}
/// <summary>
/// Extracts literal "(text)" strings from an uncompressed PDF content stream
/// by scanning it byte by byte and watching for a handful of text operators.
/// </summary>
public class PDFParser
{
    /// BT = Beginning of a text object operator
    /// ET = End of a text object operator
    /// Td move to the start of next line
    /// 5 Ts = superscript
    /// -5 Ts = subscript
    #region Fields
    #region _numberOfCharsToKeep
    /// <summary>
    /// The number of characters to keep, when extracting text.
    /// </summary>
    private static readonly int _numberOfCharsToKeep = 15;
    #endregion
    // Operator groups, created once instead of once per input byte.
    private static readonly string[] LineMoveTokens = new string[] { "TD", "Td" };
    private static readonly string[] NextLineTokens = new string[] { "'", "T*", "\"" };
    private static readonly string[] ShowTextTokens = new string[] { "Tj" };
    private static readonly string[] EndTextTokens = new string[] { "ET" };
    private static readonly string[] BeginTextTokens = new string[] { "BT" };
    #endregion
    #region ExtractTextFromPDFBytes
    /// <summary>
    /// This method processes an uncompressed Adobe (text) object
    /// and extracts text.
    /// </summary>
    /// <remarks>
    /// Output markers: "\n\r" after TD/Td, "\n" after ', T* or ", and a single
    /// space after Tj and ET. Inside a string a backslash causes the next
    /// character to be emitted as-is; only characters in the ranges
    /// ' '..'~' and 128..254 are emitted. The bracket depth only ever toggles
    /// between 0 and 1, so nested unescaped parentheses are not tracked.
    /// </remarks>
    /// <param name="input">uncompressed</param>
    /// <returns>The extracted text, or an empty string for null/empty input or on error.</returns>
    public string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0) return "";
        try
        {
            System.Text.StringBuilder resultString = new System.Text.StringBuilder();
            // Flag showing if we are we currently inside a text object
            bool inTextObject = false;
            // Flag showing if the next character is literal
            // e.g. '\\' to get a '\' character or '\(' to get '('
            bool nextLiteral = false;
            // () Bracket nesting level. Text appears inside ()
            int bracketDepth = 0;
            // Keep previous chars to get extract numbers etc.:
            char[] previousCharacters = new char[_numberOfCharsToKeep];
            for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];
                if (inTextObject)
                {
                    // Position the text
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(LineMoveTokens, previousCharacters))
                        {
                            resultString.Append("\n\r");
                        }
                        else if (CheckToken(NextLineTokens, previousCharacters))
                        {
                            resultString.Append("\n");
                        }
                        else if (CheckToken(ShowTextTokens, previousCharacters))
                        {
                            resultString.Append(" ");
                        }
                    }
                    if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                    {
                        // End of a text object, also go to a new line.
                        inTextObject = false;
                        resultString.Append(" ");
                    }
                    else if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                    {
                        // Start outputting text
                        bracketDepth = 1;
                    }
                    else if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                    {
                        // Stop outputting text
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        // Just a normal text character:
                        // Only print out next character no matter what.
                        // Do not interpret.
                        if (c == '\\' && !nextLiteral)
                        {
                            nextLiteral = true;
                        }
                        else
                        {
                            if (((c >= ' ') && (c <= '~')) ||
                                ((c >= 128) && (c < 255)))
                            {
                                resultString.Append(c);
                            }
                            nextLiteral = false;
                        }
                    }
                }
                // Store the recent characters for
                // when we have to go back for a checking
                Array.Copy(previousCharacters, 1, previousCharacters, 0, _numberOfCharsToKeep - 1);
                previousCharacters[_numberOfCharsToKeep - 1] = c;
                // Start of a text object
                if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }
            return resultString.ToString();
        }
        catch
        {
            // Best effort: an empty string means nothing could be extracted.
            return "";
        }
    }
    #endregion
    #region CheckToken
    /// <summary>
    /// Check if one of the given operator tokens just came along (e.g. BT).
    /// A token counts as seen when it sits immediately before the most recent
    /// character, and both that character and the one just before the token
    /// are a space, carriage return or line feed.
    /// </summary>
    /// <param name="tokens">the searched tokens</param>
    /// <param name="recent">the recent character array</param>
    /// <returns>true if any of the tokens matches, otherwise false</returns>
    private bool CheckToken(string[] tokens, char[] recent)
    {
        int last = recent.Length - 1;
        if (!IsDelimiter(recent[last]))
        {
            return false;
        }
        foreach (string token in tokens)
        {
            // Index in the buffer where the token would have to start.
            int start = last - token.Length;
            if (token.Length == 0 || start < 1)
            {
                continue;
            }
            if (!IsDelimiter(recent[start - 1]))
            {
                continue;
            }
            bool same = true;
            for (int k = 0; k < token.Length; k++)
            {
                if (recent[start + k] != token[k])
                {
                    same = false;
                    break;
                }
            }
            // One-character tokens are compared like any other. They used to
            // end the whole search with "false", which hid every token listed
            // after them (so T* was never detected).
            if (same)
            {
                return true;
            }
        }
        return false;
    }
    /// <summary>True for the characters treated as token separators: space, CR and LF.</summary>
    private static bool IsDelimiter(char c)
    {
        return c == ' ' || c == 0x0d || c == 0x0a;
    }
    #endregion
}
Run Code Online (Sandbox Code Playgroud)
Thank you to all those who provided help and snippets that allowed me to finally pull a working solution together
| 归档时间: |
|
| 查看次数: |
4896 次 |
| 最近记录: |