//Extracting text from a PDF on Cross-Platform (Core)

The following shows how to extract text from a PDF document.

Text extraction reading order is not defined in the ISO PDF standard. In fact,
there is no concept of sentence, paragraph, table, or anything similar in a
typical PDF file. This means each PDF vendor is left to their own design/solution
and will extract text with some differences. Therefore, the reading order is not
guaranteed to match the order that a typical user reading the document would
expect.

The reading orders of a magazine, a newspaper article, and an academic article are
all quite different. Because of the lack of semantic information in a PDF and the
placement/ordering of text in the document, different users may have
different expectations of the correct reading order.


// Open the document named by `filename` and select page 1.
PDFDoc doc = new PDFDoc(filename);

Page page = doc.GetPage(1);

TextExtractor txt = new TextExtractor();
txt.Begin(page); // Read the page; must happen before any line/word is requested.

// Extract words one by one.
// The outer loop walks the extracted lines, the inner loop walks the words of each line.
TextExtractor.Word word;
for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
{
    for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
    {
        // word.GetString();
    }
}

//Read a PDF File sample

Full sample code which illustrates the basic text extraction capabilities.

Extract text under an annotation

To extract text from under an annotation in the document:

// Open the document named by `filename`, then fetch page 1 and its annotation at index 0.
PDFDoc doc = new PDFDoc(filename);

Page page = doc.GetPage(1);
Annot annotation = page.GetAnnot(0);

TextExtractor txt = new TextExtractor();

txt.Begin(page); // Read the page.
// Text located under the annotation on the page that was just read.
string textData = txt.GetTextUnderAnnot(annotation);


// Open the document named by `filename` and select page 1.
PDFDoc doc(filename);
Page page = doc.GetPage(1);

TextExtractor txt;
txt.Begin(page); // Read the page.

// Extract words one by one.
// The outer loop walks the extracted lines, the inner loop walks the words of each line.
TextExtractor::Line line = txt.GetFirstLine();
TextExtractor::Word word;
for (; line.IsValid(); line = line.GetNextLine())
{
    for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
    {
        // word.GetString();
    }
}

PDFDoc doc(filename);
Page page = doc.GetPage(1);
Annot annotation = page.GetAnnot(0);

TextExtractor txt;
txt.Begin(page); // Read the page.
UString textData = txt.GetTextUnderAnnot(annotation);


// Open the document named by filename, then fetch page 1 and its annotation at index 0.
doc := NewPDFDoc(filename)
page := doc.GetPage(1)
annotation := page.GetAnnot(0)

txt := NewTextExtractor()
txt.Begin(page) // Read the page.
// Text located under the annotation on the page that was just read.
textData := txt.GetTextUnderAnnot(annotation)


JavaScript

// Load the document from `filename` and select page 1.
const doc = await PDFNet.PDFDoc.createFromURL(filename);

const page = await doc.getPage(1);

const txt = await PDFNet.TextExtractor.create();

// Restrict extraction to the page's crop box.
const rect = await page.getCropBox();
await txt.begin(page, rect); // Read the page.

// Extract words one by one.
// The outer loop walks the extracted lines, the inner loop walks the words of each line.
let line = await txt.getFirstLine();
for (; (await line.isValid()); line = (await line.getNextLine())) {
  // `word` is declared here so it does not leak as an implicit global.
  for (let word = await line.getFirstWord(); (await word.isValid()); word = (await word.getNextWord())) {
    // await word.getString();
  }
}


// Load the document from `filename`, then fetch page 1 and its annotation at index 0.
const doc = await PDFNet.PDFDoc.createFromURL(filename);

const page = await doc.getPage(1);
const annotation = await page.getAnnot(0);

const txt = await PDFNet.TextExtractor.create();

// Restrict extraction to the page's crop box.
const rect = await page.getCropBox();
await txt.begin(page, rect); // Read the page.
// Text located under the annotation on the page that was just read.
const textData = await txt.getTextUnderAnnot(annotation);

//Table extraction
The REST API demo is a POST request to the PDFTron.AI endpoint (the URL is truncated in this copy: https://ai-…). It will provide an HTML and XFDF in its response.

Please visit our online table extraction demo to try out the PDFTron.AI tool in the browser.
Here's an example code snippet for uploading a PDF to the demo using the API:

// Wrap the raw PDF data in a File object named 'mypdf.pdf'.
// NOTE(review): `file`, `fileData`, `originalFile` and `originalName` are not declared in this
// snippet; they are assumed to exist in the surrounding code.
file = new File([fileData], 'mypdf.pdf');

const xhttp = new XMLHttpRequest();
// Forward every ready-state change to the enclosing object's response handler.
xhttp.onreadystatechange = () => this.handleResp(xhttp, originalFile, 'local');
const endpoint = ''; // TODO: set to the table extraction demo URL.
xhttp.open('POST', endpoint, true); // third argument `true` = asynchronous request
// Request headers can only be set after open() and before send().
xhttp.setRequestHeader("Content-type", "application/json");
xhttp.setRequestHeader("File-Name", originalName || 'mypdf.pdf');
// Without send() the request is never issued.
xhttp.send(file);


' For every .pdf in objStartFolder that has no matching .txt yet, opens the file
' with its associated application and sends a fixed key sequence to that
' application, then waits until the resulting .txt file can be opened.
' NOTE(review): objStartFolder is not set in this snippet; it must be assigned
' before this code runs.
Set objFSO = CreateObject("Scripting.FileSystemObject")
Set WshShell = WScript.CreateObject("WScript.Shell")

Set objFolder = objFSO.GetFolder(objStartFolder)
Set colFiles = objFolder.Files
For Each objFile In colFiles
    ' GetExtensionName/GetBaseName also work for names shorter than four
    ' characters, where Mid(name, Len(name) - 3, 4) would raise an error.
    ' LCase makes the ".pdf" test below case-insensitive.
    extension = "." & LCase(objFSO.GetExtensionName(objFile.Name))
    file = objFSO.GetBaseName(objFile.Name)
    fullname = objFSO.BuildPath(objStartFolder, objFile.Name)
    fullname_txt = objFSO.BuildPath(objStartFolder, file + ".txt")

    If extension = ".pdf" And Not objFSO.FileExists(fullname_txt) Then

        WScript.Echo fullname
        ' The doubled quotes wrap the path in literal quotes so paths with spaces work.
        WshShell.Run """" + fullname + """"
        WScript.Sleep 1000
        ' Key sequence: Alt, f, h, x, Enter.
        ' NOTE(review): presumably walks the viewer's File menu to its
        ' "save as text" command - confirm against the installed viewer.
        WshShell.SendKeys "%"
        WScript.Sleep 100
        WshShell.SendKeys "f"
        WScript.Sleep 100
        WshShell.SendKeys "h"
        WScript.Sleep 100
        WshShell.SendKeys "x"
        WScript.Sleep 300
        WshShell.SendKeys "{ENTER}"

        ' This little step prevents the loop from moving on to the next .pdf
        ' before the conversion to .txt is complete: it retries opening the
        ' .txt file (mode 8 = ForAppending) up to 100 times.
        ' i must be reset for every file, otherwise only the first file is waited for.
        i = 0
        count = 0
        On Error Resume Next
        Do While i = 0 And count < 100
            Err.Clear
            Set MyFile = objFSO.OpenTextFile(fullname_txt, 8)
            If Err.Number = 0 Then
                i = 1
                ' Release the handle so the file is not left open.
                MyFile.Close
            End If
            count = count + 1
            WScript.Sleep 20000
        Loop
        ' Restore normal error handling for the rest of the script.
        On Error GoTo 0
    End If
Next

