C#解析PDF

8/3/2015来源:C#应用人气:1642

C#解析PDF

C#解析PDF的方式有很多,比较好用的有iTextSharp和PdfBox。

PDF内容页如果是图片类型,例如扫描件,则需要进行OCR(光学字符识别)。

对于文本内容的PDF文档,在解析过程中,我目前只发现能以字符串形式读取其内容的方法,无法直接读取其中的表格。据说PDF文档结构中没有表格的概念,因此表格自然读不到。如果确实如此,则PDF中表格内容的解析,只能对获取到的字符串按照一定的逻辑自行处理了。

iTextSharp是一个C#开源项目;PdfBox为Java开源项目,借助于IKVM在.NET平台下有实现。

Pdf转换Image,使用的是GhostScript,可以以API的方式调用,也可以以Windows命令行的方式调用。

OCR使用的是Asprise,识别效果较好(商业软件);另外还可以使用MS的Office Document Imaging(MODI,2007)或OneNote(2010)(需要依赖Office组件),以及Tesseract(HP->Google)(效果很差)。

附上iTextSharp、PdfBox对PDF的解析代码。

iTextSharp辅助类

  using System;
  using System.Collections.Generic;
  using System.Text;

  using iTextSharp.text.pdf;
  using iTextSharp.text.pdf.parser;
  using System.IO;

  namespace eyuan
  {
      /// <summary>
      /// Helper methods that use iTextSharp to read text and page counts from PDF files.
      /// Failures are reported through <c>LogHandler.LogWrite</c> and signalled to the
      /// caller with a sentinel return value instead of an exception.
      /// </summary>
      public static class ITextSharpHandler
      {
          /// <summary>
          /// Reads the text content of every page of a PDF file.
          /// </summary>
          /// <param name="fileName">Path of the PDF file to read.</param>
          /// <returns>
          /// The text of all pages, one <c>AppendLine</c> per page. Returns
          /// <see cref="string.Empty"/> when the file does not exist or cannot be opened.
          /// If extraction fails part-way, the text collected so far is returned.
          /// </returns>
          public static string ReadPdf(string fileName)
          {
              PdfReader reader = OpenReader(fileName);
              if (reader == null)
              {
                  return string.Empty;
              }

              StringBuilder sbFileContent = new StringBuilder();
              try
              {
                  // Page indexes passed to GetTextFromPage start at 1.
                  for (int i = 1; i <= reader.NumberOfPages; i++)
                  {
                      sbFileContent.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
                  }
              }
              catch (Exception ex)
              {
                  // Best effort: log the failure and fall through so that the pages
                  // extracted before the error are still returned.
                  LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
              }
              finally
              {
                  reader.Close();
              }

              return sbFileContent.ToString();
          }

          /// <summary>
          /// Gets the number of pages in a PDF file.
          /// </summary>
          /// <param name="fileName">Path of the PDF file to inspect.</param>
          /// <returns>
          /// The page count, or -1 when the file does not exist or cannot be opened.
          /// </returns>
          public static int GetPdfPageCount(string fileName)
          {
              PdfReader reader = OpenReader(fileName);
              if (reader == null)
              {
                  return -1;
              }

              try
              {
                  return reader.NumberOfPages;
              }
              finally
              {
                  // The reader must be closed on every path; the previous version
                  // returned without closing it.
                  reader.Close();
              }
          }

          /// <summary>
          /// Checks that the file exists and opens it with a <see cref="PdfReader"/>.
          /// </summary>
          /// <param name="fileName">Path of the PDF file to open.</param>
          /// <returns>
          /// An open reader that the caller must close, or <c>null</c> (after logging
          /// the reason) when the file is missing or the reader cannot be constructed.
          /// </returns>
          private static PdfReader OpenReader(string fileName)
          {
              if (!File.Exists(fileName))
              {
                  LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
                  return null;
              }

              try
              {
                  return new PdfReader(fileName);
              }
              catch (Exception ex)
              {
                  // No reader instance exists when the constructor throws, so there is
                  // nothing to close here.
                  LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
                  return null;
              }
          }
      }
  }

PDFBox辅助类

 using org.pdfbox.pdmodel;
 using org.pdfbox.util;
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Text;

 namespace eyuan
 {
     /// <summary>
     /// Helper that extracts text from PDF files through the PDFBox component.
     /// </summary>
     public static class PdfBoxHandler
     {
         /// <summary>
         /// Reads the text content of a PDF file using PDFBox.
         /// </summary>
         /// <param name="input">Path of the PDF file.</param>
         /// <returns>
         /// The extracted text, or <c>null</c> when the file does not exist,
         /// cannot be loaded, or cannot be parsed. Each failure is written to the log.
         /// </returns>
         public static string ReadPdf(string input)
         {
             // Guard: nothing to do for a missing file.
             if (!File.Exists(input))
             {
                 LogHandler.LogWrite(@"指定的PDF文件不存在:" + input);
                 return null;
             }

             // Step 1: load the document.
             PDDocument document;
             try
             {
                 document = PDDocument.load(input);
             }
             catch (Exception loadError)
             {
                 // No document instance exists when load throws, so nothing needs closing.
                 LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", input, loadError.ToString()));
                 return null;
             }

             // Step 2: extract the text; the document is closed whether or not this succeeds.
             string extractedText = null;
             try
             {
                 PDFTextStripper textStripper = new PDFTextStripper();
                 extractedText = textStripper.getText(document);
             }
             catch (Exception parseError)
             {
                 // Logged only; the method then returns null.
                 LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", input, parseError.ToString()));
             }
             finally
             {
                 document.close();
             }

             return extractedText;
         }
     }
 }

另外附上PDF转Image,然后对Image进行OCR的代码。

转换PDF为Jpeg图片代码(GhostScript辅助类)

  1 using System;  2 using System.Collections;  3 using System.Collections.Generic;  4 using System.Runtime.InteropServices;  5 using System.Text;  6   7 namespace eyuan  8 {  9     public class GhostscriptHandler 10     { 11  12         #region GhostScript Import 13         /// <summary>创建Ghostscript的实例 14         /// This instance is passed to most other gsapi functions.  15         /// The caller_handle will be PRovided to callback functions.   16         ///  At this stage, Ghostscript supports only one instance. </summary>   17         /// <param name="pinstance"></param>   18         /// <param name="caller_handle"></param>   19         /// <returns></returns>    20         [DllImport("gsdll32.dll", EntryPoint = "gsapi_new_instance")] 21         private static extern int gsapi_new_instance(out IntPtr pinstance, IntPtr caller_handle); 22         /// <summary>This is the important function that will perform the conversion 23         ///  24         /// </summary>   25         /// <param name="instance"></param>   26         /// <param name="argc"></param>   27         /// <param name="argv"></param>   28         /// <returns></returns>   29         [DllImport("gsdll32.dll", EntryPoint = "gsapi_init_with_args")] 30         private static extern int gsapi_init_with_args(IntPtr instance, int argc, IntPtr argv); 31         /// <summary>   32         /// Exit the interpreter.  33         /// This must be called on shutdown if gsapi_init_with_args() has been called,  34         /// and just before gsapi_delete_instance(). 35         /// 退出 36         /// </summary>   37         /// <param name="instance"></param>   38         /// <returns></returns>   39         [DllImport("gsdll32.dll", EntryPoint = "gsapi_exit")] 40         private static extern int gsapi_exit(IntPtr instance); 41         /// <summary>   42         /// Destroy an instance