Extract text and images and insert into new PDF


How do I extract all the text and images from one PDF and draw it onto pages in a new PDF?


Here is a code sample (Delphi and C# examples are shown below) that demonstrates how to extract all of the text and images from one PDF and draw them onto as many pages as required in a new PDF.

Delphi Code:

  // Local variables used by the extraction code that follows.
  // NOTE(review): the `var` keyword and the enclosing routine header are not
  // shown in this listing.
  FH: Integer;             // File handle returned by QP.DAOpenFile
  PR: Integer;             // Page reference returned by QP.DAFindPage
  SL: TStringList;         // Extracted page text, one list entry per line
  Data: string;            // Unparsed remainder of the current text line
  Font: string;            // 1st comma-separated field of the line
  Color: string;           // 2nd comma-separated field of the line
  Size: string;            // 3rd comma-separated field of the line
  X1, Y1, X2, Y2, X3, Y3, X4, Y4: string;  // 4th-11th fields, parsed in this order
  Text: string;            // Rest of the line with its first and last characters removed
  X: Integer;              // Loop index, reused by the text loop and the image loop
  IL: Integer;             // Image list reference returned by QP.DAGetPageImageList
  TextBlockLeft: Double;   // Not referenced by the code shown below
  TextBlockTop: Double;    // Not referenced by the code shown below
  PageNum: Integer;        // Page counter of the outer loop, starting at 1
  ImageData: string;       // Image data returned by QP.DAGetImageDataToString
  ImageLeft, ImageTop, ImageWidth, ImageHeight: Double;  // Arguments passed to QP.DrawImage

begin
  // Open the file in direct access mode and store the file handle
  FH := QP.DAOpenFile('Xpod1228090001.pdf', '');
  try
    // Loop through all the pages; every source page is written to its own PDF
    for PageNum := 1 to QP.DAGetPageCount(FH) do
    begin
      // Start a new document
      QP.NewDocument;

      // Specify that images should be compressed
      QP.CompressImages(1);

      // Get a page reference to the current page
      PR := QP.DAFindPage(FH, PageNum);

      // Create a string list to hold the text data
      SL := TStringList.Create;
      try
        // Extract the text from the current page. With option 4 each line is
        // a comma-separated record:
        //   Font, Color, Size, X1, Y1, X2, Y2, X3, Y3, X4, Y4, "Text"
        SL.Text := QP.DAExtractPageText(FH, PR, 4);

        // Add each block of text to the new document
        for X := 0 to SL.Count - 1 do
        begin
          Data := SL[X];

          // Peel the leading fields off the record one at a time: copy up to
          // the next comma, then delete the field and its comma from Data.
          Font := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(Font) + 1);
          Color := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(Color) + 1);
          Size := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(Size) + 1);

          X1 := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(X1) + 1);
          Y1 := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(Y1) + 1);
          X2 := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(X2) + 1);
          Y2 := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(Y2) + 1);
          X3 := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(X3) + 1);
          Y3 := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(Y3) + 1);
          X4 := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(X4) + 1);
          Y4 := Copy(Data, 1, Pos(',', Data) - 1);
          Delete(Data, 1, Length(Y4) + 1);

          // What is left is the text itself; drop its first and last
          // characters (the enclosing quotes).
          Text := Copy(Data, 2, Length(Data) - 2);

          // Replace the utf-8 encoded TM symbol with the
          // PDF WinAnsi character code
          if Pos(#226#132#162, Text) > 0 then
            Text := StringReplace(Text, #226#132#162, #153,
              [rfReplaceAll]);

          // Set the text size
          // NOTE(review): StrToFloat uses the current locale's decimal
          // separator - confirm it matches the extracted number format.
          QP.SetTextSize(StrToFloat(Size));

          // Draw the text, shift up by the font's "descent" value
          QP.DrawText(StrToFloat(X1),
            StrToFloat(Y1) - QP.GetTextDescent,
            Text);
        end;  // End text loop
      finally
        // Release the string list even if parsing or drawing raised
        SL.Free;
      end;

      // Find all the images on the page
      IL := QP.DAGetPageImageList(FH, PR);

      // Loop through all the images
      for X := 1 to QP.DAGetImageListCount(FH, IL) do
      begin
        // Read the image data
        ImageData := QP.DAGetImageDataToString(FH, IL, X);

        // Add the image data to the new document
        QP.AddImageFromString(ImageData, 0);

        // Determine the location and size of the image on the page:
        // width  = property 503 - property 501
        // height = property 502 - property 508
        ImageLeft := QP.DAGetImageDblProperty(FH, IL, X, 501);
        ImageTop := QP.DAGetImageDblProperty(FH, IL, X, 502);
        ImageWidth := QP.DAGetImageDblProperty(FH, IL, X, 503) -
          QP.DAGetImageDblProperty(FH, IL, X, 501);
        ImageHeight := QP.DAGetImageDblProperty(FH, IL, X, 502) -
          QP.DAGetImageDblProperty(FH, IL, X, 508);

        // Draw the image onto the new document's page
        QP.DrawImage(ImageLeft, ImageTop, ImageWidth, ImageHeight);

      end;  // End image loop

      // Compress the page description commands
      QP.CompressContent;

      // Save the file
      QP.SaveToFile('XPod-' + IntToStr(PageNum) + '.pdf');

      // Remove the document
      QP.RemoveDocument(QP.SelectedDocument);

    end;  // End page loop
  finally
    // Close the direct access file handle opened at the top
    QP.DACloseFile(FH);
  end;
end;

That is all that is required. This code could be enhanced further to replicate any bold or italic text by using the same functions to retrieve more of the text and font properties.

C# Code


// Open the source PDF in direct access mode; every DA* call below uses this handle.
int FileHandle = QP.DAOpenFile("C:\\Input.pdf", "");

// Separators for the extracted CSV text. Splitting on '\r' and '\n' individually
// needs RemoveEmptyEntries, otherwise "\r\n" yields an empty entry per line break.
char[] lineBreaks = { '\r', '\n' };
char[] fieldSeparator = { ',' };

// The extracted numbers are machine-generated, so parse them culture-independently.
// NOTE(review): assumes the library writes '.' as the decimal separator - confirm.
System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

int pageCount = QP.DAGetPageCount(FileHandle);
for (int i = 1; i <= pageCount; i++)
{
    // Create an empty document in memory
    int NewId = QP.NewDocument();

    // Get page reference
    int PageRef = QP.DAFindPage(FileHandle, i);

    // Get the page text in CSV format, one record per line, with the fields:
    // Font Name, Text Color, Text Size, X1, Y1, X2, Y2, X3, Y3, X4, Y4, Text
    string PageText = QP.DAExtractPageText(FileHandle, PageRef, 4);
    string[] TextLines = PageText.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);

    foreach (string line in TextLines)
    {
        // Stop splitting after 12 fields so commas inside the trailing Text
        // field stay part of the text instead of truncating it.
        string[] DataFields = line.Split(fieldSeparator, 12);
        if (DataFields.Length < 12)
        {
            // Not a complete record; skip it rather than index out of range.
            continue;
        }

        string size = DataFields[2].Trim();
        string x1 = DataFields[3];
        string y1 = DataFields[4];
        string word = DataFields[11].Replace("\"", "");

        // Set text font size
        QP.SetTextSize(Convert.ToDouble(size, invariant));

        // Draw text to the new document, shifted by the font's descent value
        QP.DrawText(
            Convert.ToDouble(x1, invariant),
            Convert.ToDouble(y1, invariant) - QP.GetTextDescent(),
            word);
    }

    // Get image list reference
    int ImageListId = QP.DAGetPageImageList(FileHandle, PageRef);
    int imageCount = QP.DAGetImageListCount(FileHandle, ImageListId);

    for (int j = 1; j <= imageCount; j++)
    {
        // Read the image data as a byte array and add it to the new document
        byte[] ImageData = QP.DAGetImageDataToString(FileHandle, ImageListId, j);
        QP.AddImageFromString(ImageData, 0);

        // Retrieve the image's left, top, width and height:
        // width  = property 503 - property 501
        // height = property 502 - property 508
        double imgLeft = QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 501);
        double imgTop = QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 502);
        double imgWidth = QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 503) - imgLeft;
        double imgHeight = imgTop - QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 508);

        // Draw image to new document
        QP.DrawImage(imgLeft, imgTop, imgWidth, imgHeight);
    }

    // Save each page to a new file, numbered by page (the output path is an example)
    QP.SaveToFile("C:\\Output-" + i + ".pdf");

    // Remove the in-memory document so documents do not accumulate across pages
    QP.RemoveDocument(NewId);
}

// Close the direct access file handle opened at the top
QP.DACloseFile(FileHandle);

