The following code shows how to extract text from a PDF file. To build it, you will need to add references to PDFjet.NET and iTextSharp.
using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;
using PDFjet.NET;
/// <summary>
/// Form load handler: extracts text from a hard-coded PDF file and shows it in
/// <c>richTextBox1</c>. Failures reported as <see cref="DocumentException"/> or
/// <see cref="IOException"/> are displayed in a message box instead of propagating.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form1_Load(object sender, EventArgs e)
{
    // The original also created an iTextSharp Document here that was never opened
    // or written to and was only closed again in a finally block; that dead object
    // has been removed.
    try
    {
        // Page arguments 3 and 4 are passed straight through to extractPDFText.
        richTextBox1.Text = extractPDFText(@"D:\Osama_Stuff\apache20.pdf", 3, 4);
    }
    catch (DocumentException de)
    {
        MessageBox.Show(de.Message);
    }
    catch (IOException ioEx)
    {
        MessageBox.Show(ioEx.Message);
    }
}
/// <summary>
/// Extracts text from a range of pages of a PDF file by tokenising each page's
/// content bytes and concatenating the string tokens found there.
/// </summary>
/// <param name="sourcePDF">Path of the PDF file to read.</param>
/// <param name="fromPageNo">First page to extract; 0 is treated as page 1.</param>
/// <param name="toPageNo">
/// Last page to extract (inclusive); 0 means "through the last page of the document".
/// </param>
/// <returns>The concatenated text of the requested pages (empty if none was found).</returns>
public string extractPDFText(string sourcePDF, int fromPageNo, int toPageNo)
{
    // Numeric token-type codes the tokeniser reports.
    // NOTE(review): presumably these correspond to PRTokeniser.TokType.NUMBER and
    // PRTokeniser.TokType.OTHER - confirm against the iTextSharp version in use.
    const int numberTokenType = 1;
    const int otherTokenType = 10;

    StringBuilder sb = new StringBuilder();
    PdfReader reader = new PdfReader(sourcePDF);
    try
    {
        if (fromPageNo == 0)
        {
            fromPageNo = 1;
        }
        if (toPageNo == 0)
        {
            toPageNo = reader.NumberOfPages;
        }

        // Inclusive upper bound: the original used "<", which silently dropped the
        // last requested page (and the final page of the document when toPageNo was 0).
        for (int i = fromPageNo; i <= toPageNo; i++)
        {
            byte[] pageBytes = reader.GetPageContent(i);
            if (pageBytes == null)
            {
                continue;
            }

            PRTokeniser token = new PRTokeniser(pageBytes);
            while (token.NextToken())
            {
                int tokenType = (int)token.TokenType;
                string tokenValue = token.StringValue;

                if (tokenType == (int)PRTokeniser.TokType.STRING)
                {
                    sb.Append(tokenValue);
                }
                else if (tokenType == numberTokenType && tokenValue.Equals("-600"))
                {
                    // NOTE(review): presumably -600 is the glyph-position adjustment
                    // this particular PDF uses as a word gap - confirm; other
                    // documents may use different values.
                    sb.Append(" ");
                }
                else if (tokenType == otherTokenType && tokenValue.Equals("TJ"))
                {
                    // "TJ" is the PDF show-text-array operator; emit a separator
                    // after each one so consecutive text runs do not fuse together.
                    sb.Append(" ");
                }
            }
        }
    }
    finally
    {
        // Release the reader's hold on the source file even if tokenising fails.
        reader.Close();
    }

    return sb.ToString();
}
0 comments:
Post a Comment