Do you own a Debenu Quick PDF Library 12/11/10/9/8/7? Upgrade to Debenu Quick PDF Library 13!

Foxit Quick PDF Library

Frequently Asked Question:

Return to FAQ Index

Extract text and images and insert into new PDF

Question

How do I extract all the text and images from one PDF and draw it onto pages in a new PDF?

Answer

Here is a code sample (Delphi and C# examples are shown below) that demonstrates how to extract all of the text and images from one PDF and draw them onto as many pages as required in a new PDF.

Delphi Code:

var
  FH: Integer;
  PR: Integer;
  SL: TStringList;
  Data: string;
  Font: string;
  Color: string;
  Size: string;
  X1, Y1, X2, Y2, X3, Y3, X4, Y4: string;
  Text: string;
  X: Integer;
  IL: Integer;
  PageNum: Integer;
  ImageData: string;
  ImageLeft, ImageTop, ImageWidth, ImageHeight: Double;
  FS: TFormatSettings;

// Splits every page of the source PDF into its own new PDF that contains
// the page's text blocks and images, redrawn at their original positions.

// Parse the numeric CSV fields with a fixed '.' decimal separator so the
// result does not depend on the regional settings of the machine.
// NOTE(review): assumes the library always writes '.' as the decimal
// separator in the CSV output - confirm against the library reference.
FS.DecimalSeparator := '.';
FS.ThousandSeparator := ',';

// Open the file in direct access mode and store the file handle
FH := QP.DAOpenFile('Xpod1228090001.pdf', '');

// Only continue when a file handle was obtained
if FH <> 0 then
try
  // Loop through all the pages
  for PageNum := 1 to QP.DAGetPageCount(FH) do
  begin
    // Start a new document
    QP.NewDocument;

    // Specify that images should be compressed
    QP.CompressImages(1);

    // Get a page reference to the current page
    PR := QP.DAFindPage(FH, PageNum);

    // Create a string list to hold the text data
    SL := TStringList.Create;
    try

      // Extract the text from the current page (option 4 = CSV records:
      // Font, Color, Size, X1, Y1, X2, Y2, X3, Y3, X4, Y4, "Text")
      SL.Text := QP.DAExtractPageText(FH, PR, 4);

      // Add each block of text to the new document
      for X := 0 to SL.Count - 1 do
      begin
        Data := SL[X];

        // A blank line carries no text block; StrToFloat would raise on it
        if Trim(Data) = '' then
          Continue;

        // Peel the leading fields off one at a time. The text field is
        // whatever remains, so commas inside the text are preserved.
        Font := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(Font) + 1);
        Color := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(Color) + 1);
        Size := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(Size) + 1);

        X1 := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(X1) + 1);
        Y1 := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(Y1) + 1);
        X2 := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(X2) + 1);
        Y2 := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(Y2) + 1);
        X3 := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(X3) + 1);
        Y3 := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(Y3) + 1);
        X4 := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(X4) + 1);
        Y4 := Copy(Data, 1, Pos(',', Data) - 1);
        Delete(Data, 1, Length(Y4) + 1);

        // Drop the first and last character (the enclosing quotes)
        Text := Copy(Data, 2, Length(Data) - 2);

        // Replace the utf-8 encoded TM symbol with the
        // PDF WinAnsi character code
        if Pos(#226#132#162, Text) > 0 then
          Text := StringReplace(Text, #226#132#162, #153,
            [rfReplaceAll]);

        // Set the text size
        QP.SetTextSize(StrToFloat(Size, FS));

        // Draw the text, shift up by the font's "descent" value
        QP.DrawText(StrToFloat(X1, FS),
          StrToFloat(Y1, FS) - QP.GetTextDescent,
          Text);
      end;
    finally
      SL.Free;
    end;

    // Find all the images on the page
    IL := QP.DAGetPageImageList(FH, PR);

    // Loop through all the images
    for X := 1 to QP.DAGetImageListCount(FH, IL) do
    begin

      // Read the image data
      ImageData := QP.DAGetImageDataToString(FH, IL, X);

      // Add the image data to the new document
      QP.AddImageFromString(ImageData, 0);

      // Determine the location and size of the image on the page:
      // width = property 503 - property 501, height = property 502 - 508
      ImageLeft := QP.DAGetImageDblProperty(FH, IL, X, 501);
      ImageTop := QP.DAGetImageDblProperty(FH, IL, X, 502);
      ImageWidth := QP.DAGetImageDblProperty(FH, IL, X, 503) - ImageLeft;
      ImageHeight := ImageTop -
        QP.DAGetImageDblProperty(FH, IL, X, 508);

      // Draw the image onto the new document's page
      QP.DrawImage(ImageLeft, ImageTop, ImageWidth, ImageHeight);

    end;  // End image loop

    // Compress the page description commands
    QP.CompressContent;

    // Save the file
    QP.SaveToFile('XPod-' + IntToStr(PageNum) + '.pdf');

    // Remove the document
    QP.RemoveDocument(QP.SelectedDocument);

  end;  // End page loop
finally
  // Release the direct access file handle, even if a page failed
  QP.DACloseFile(FH);
end;

That is all that is required. This code could be enhanced further to replicate bold or italic text by using the same functions to retrieve more of the text/font properties.

C# Code:

QP.UnlockKey(".........LicenceKey.....");

// Splits every page of the input PDF into its own new PDF that contains the
// page's text blocks and images, redrawn at their original positions.

// Extraction option 4 returns one CSV record per text block, in this format:
// Font Name, Text Color, Text Size, X1, Y1, X2, Y2, X3, Y3, X4, Y4, Text
const int FieldCount = 12;
char[] lineSeparators = new char[] { '\r', '\n' };
char[] fieldSeparator = new char[] { ',' };

// Parse the numeric CSV fields independently of the machine's regional settings.
// NOTE(review): assumes the library always writes '.' as the decimal separator - confirm.
System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
System.Globalization.NumberStyles floatStyle = System.Globalization.NumberStyles.Float;

int FileHandle = QP.DAOpenFile("C:\\Input.pdf", "");

// Only continue when a file handle was obtained
if (FileHandle != 0)
{
  try
  {
    int pageCount = QP.DAGetPageCount(FileHandle);

    for (int i = 1; i <= pageCount; i++)
    {
      // Create an empty document in memory
      int NewId = QP.NewDocument();
      QP.CompressImages(1);

      // Get page reference
      int PageRef = QP.DAFindPage(FileHandle, i);

      // Get page text in CSV format, one record per line
      string PageText = QP.DAExtractPageText(FileHandle, PageRef, 4);
      string[] TextLines = PageText.Split(lineSeparators, StringSplitOptions.RemoveEmptyEntries);

      foreach (string line in TextLines)
      {
        // Limit the split to FieldCount parts so that commas inside the text
        // itself stay in the final field instead of truncating it.
        string[] DataFields = line.Split(fieldSeparator, FieldCount);
        if (DataFields.Length < FieldCount)
        {
          // Not a complete text record; skip it
          continue;
        }

        double size;
        double x1;
        double y1;
        if (!double.TryParse(DataFields[2], floatStyle, invariant, out size) ||
            !double.TryParse(DataFields[3], floatStyle, invariant, out x1) ||
            !double.TryParse(DataFields[4], floatStyle, invariant, out y1))
        {
          // Malformed numeric field; skip this record rather than abort the page
          continue;
        }

        // Strip the enclosing quotes from the text field only.
        // NOTE(review): quotes inside the text are kept as-is; confirm how the
        // library escapes embedded quotes in its CSV output.
        string word = DataFields[11].Trim();
        if (word.Length >= 2 && word[0] == '"' && word[word.Length - 1] == '"')
        {
          word = word.Substring(1, word.Length - 2);
        }

        // Set text font size
        QP.SetTextSize(size);
        // Draw text to new document, shifted by the font's descent value
        QP.DrawText(x1, y1 - QP.GetTextDescent(), word);
      }

      // Get image list reference
      int ImageListId = QP.DAGetPageImageList(FileHandle, PageRef);
      int imageCount = QP.DAGetImageListCount(FileHandle, ImageListId);

      for (int j = 1; j <= imageCount; j++)
      {
        // Read the image data as a byte array and add it to the new document
        byte[] ImageData = QP.DAGetImageDataToString(FileHandle, ImageListId, j);
        QP.AddImageFromString(ImageData, 0);

        // Retrieve the image's left, top, width and height:
        // width = property 503 - property 501, height = property 502 - 508
        double imgLeft = QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 501);
        double imgTop = QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 502);
        double imgWidth = QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 503) - imgLeft;
        double imgHeight = imgTop - QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 508);

        // Draw image to new document
        QP.DrawImage(imgLeft, imgTop, imgWidth, imgHeight);
      }

      QP.CompressContent();

      // Save each page to a new file, numbered by page
      QP.SaveToFile("C:\\XPod" + Convert.ToString(i) + ".pdf");

      // Remove the document by its ID. SelectDocument returns a status code,
      // not a document ID, so its result must not be passed to RemoveDocument.
      QP.RemoveDocument(NewId);
    }
  }
  finally
  {
    // Release the direct access file handle, even if a page failed
    QP.DACloseFile(FileHandle);
  }
}

© 2015 Debenu & Foxit. All rights reserved. About | Buy | Contact | Blog | Newsletter | Support | FAQ | Product Updates | Forum