+92 332 4229 857 99ProjectIdeas@Gmail.com

Extracting Text From PDF File (C#.net)




The following code shows how to extract text from a PDF file. You will need to add references to PDFjet.NET and iTextSharp to your project.









using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;
using PDFjet.NET;

/// <summary>
/// Handles the form's Load event: extracts text from a sample PDF via
/// <c>extractPDFText</c> and displays the result in <c>richTextBox1</c>.
/// Extraction errors are reported to the user in a message box.
/// </summary>
/// <param name="sender">The event source (not used).</param>
/// <param name="e">The event data (not used).</param>
private void Form1_Load(object sender, EventArgs e)
{
    // NOTE(review): the path and page arguments are hard-coded sample values;
    // replace them with user input (e.g. an OpenFileDialog) for real use.
    const string samplePdfPath = @"D:\Osama_Stuff\apache20.pdf";

    // The original created an iTextSharp Document here that was never opened
    // or written to and was only closed again in a finally block; it has been
    // removed because it had no effect on the extraction.
    try
    {
        richTextBox1.Text = extractPDFText(samplePdfPath, 3, 4);
    }
    catch (DocumentException de)
    {
        MessageBox.Show(de.Message);
    }
    catch (IOException ioEx)
    {
        MessageBox.Show(ioEx.Message);
    }
}

  public string extractPDFText(string sourcePDF, int fromPageNo, int toPageNo)
   {
            StringBuilder sb = new StringBuilder();
            PdfReader reader = new PdfReader(sourcePDF);
            byte[] pageBytes = null;
            PRTokeniser token = null;
            int tokenType = -1;
            string tokenValue = string.Empty;
           
            if (fromPageNo == 0)
            {
                fromPageNo = 1;
            }

            if (toPageNo == 0)
            {
                toPageNo = reader.NumberOfPages;
            }

            for (int i = fromPageNo; i < toPageNo; i++)
            {
                pageBytes = reader.GetPageContent(i);
               
                if (pageBytes != null)
                {
                    token = new PRTokeniser(pageBytes);
                    while (token.NextToken())
                    {
                        tokenType = (int)token.TokenType;
                        tokenValue = token.StringValue;
                        if (tokenType == (int)PRTokeniser.TokType.STRING)
                        {
                            sb.Append(token.StringValue);
                        }
                        else if (tokenType == 1 && tokenValue.Equals("-600"))
                        {
                            sb.Append(" ");
                        }
                        else if (tokenType == 10 && tokenValue.Equals("TJ"))
                        {
                            sb.Append(" ");
                        }
                    }
                }
            }

            return sb.ToString();
    }

0 comments: