Extracting Text From PDF File (C#.net) ~ BitsByta

The following code shows how to extract text from a PDF file. To use it, you need to add references to PDFjet.NET and iTextSharp to your project.

using iTextSharp.text;

using iTextSharp.text.pdf;

using System.IO;

using PDFjet.NET;

/// <summary>
/// Form load handler: runs <c>extractPDFText</c> on a sample PDF and shows the
/// returned text in <c>richTextBox1</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Form1_Load(object sender, EventArgs e)
{
    // The original snippet created an A2 Document here and closed it in a
    // finally block, but never opened it or wrote anything to it. It played
    // no part in text extraction, so it has been removed.
    try
    {
        // Sample file path and page arguments; adjust to your own document.
        richTextBox1.Text = extractPDFText(@"D:\Osama_Stuff\apache20.pdf", 3, 4);
    }
    catch (DocumentException de)
    {
        MessageBox.Show(de.Message);
    }
    catch (IOException ioEx)
    {
        // Covers a missing or unreadable file.
        MessageBox.Show(ioEx.Message);
    }
}

/// <summary>
/// Extracts text from a range of pages of a PDF by tokenising each page's
/// content bytes and concatenating the values of the string tokens.
/// </summary>
/// <param name="sourcePDF">Path of the PDF file to read.</param>
/// <param name="fromPageNo">First page to read; 0 means start at page 1.</param>
/// <param name="toPageNo">
/// Last page to read (inclusive); 0 means read up to the last page of the document.
/// </param>
/// <returns>
/// The concatenated string-token values of the selected pages, with a single
/// space inserted at the separator tokens described in the loop body.
/// </returns>
public string extractPDFText(string sourcePDF, int fromPageNo, int toPageNo)
{
    StringBuilder sb = new StringBuilder();
    PdfReader reader = new PdfReader(sourcePDF);

    try
    {
        if (fromPageNo == 0)
        {
            fromPageNo = 1;
        }

        if (toPageNo == 0)
        {
            toPageNo = reader.NumberOfPages;
        }

        // Inclusive upper bound: the original "i < toPageNo" silently skipped
        // the last requested page (and the final page of the document when
        // toPageNo defaulted to NumberOfPages).
        for (int i = fromPageNo; i <= toPageNo; i++)
        {
            byte[] pageBytes = reader.GetPageContent(i);
            if (pageBytes == null)
            {
                continue;
            }

            PRTokeniser token = new PRTokeniser(pageBytes);
            while (token.NextToken())
            {
                int tokenType = (int)token.TokenType;
                string tokenValue = token.StringValue;

                if (tokenType == (int)PRTokeniser.TokType.STRING)
                {
                    sb.Append(tokenValue);
                }
                else if (tokenType == 1 && tokenValue.Equals("-600"))
                {
                    // NOTE(review): 1 is presumably TokType.NUMBER, and "-600"
                    // looks like a heuristic for a word-sized gap inside a
                    // text array -- confirm against the iTextSharp version in use.
                    sb.Append(" ");
                }
                else if (tokenType == 10 && tokenValue.Equals("TJ"))
                {
                    // NOTE(review): 10 is presumably TokType.OTHER (operators);
                    // "TJ" appears to be the PDF show-text-array operator, so a
                    // space is emitted after each such text run -- confirm.
                    sb.Append(" ");
                }
            }
        }
    }
    finally
    {
        // Release the underlying file even if tokenising throws.
        reader.Close();
    }

    return sb.ToString();
}

BitsByta

Extracting Text From PDF File (C#.net)

0 comments:

Follow us on FaceBook

Labels

Popular Posts

Speciality

Gallery

About 99ProjectIdeas.com