You are on page 1 of 4

//Extracting text from a PDF on Cross-Platform (Core)

The snippets below show how to extract text from a PDF document.

The reading order of extracted text is not defined in the ISO PDF standard. In fact,
there is no concept of a sentence, paragraph, table, or anything similar in a
typical PDF file. This means each PDF vendor is left to its own design/solution
and will extract text with some differences. Therefore, the reading order is not
guaranteed to match the order that a typical user reading the document would
follow.

The reading orders of a magazine, a newspaper article, and an academic article are all
quite different. Because a PDF lacks semantic information, and because of the
placement/ordering of text in the document, different users may have
different expectations of the correct reading order.

//C#

// Open the document and select the page passed to GetPage (page 1 here).
PDFDoc doc = new PDFDoc(filename);
Page page = doc.GetPage(1);

TextExtractor txt = new TextExtractor();
txt.Begin(page); // Read the page.

// Extract words one by one: the outer loop walks the lines,
// the inner loop walks the words of the current line.
for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
{
    for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
    {
        // Process the current word here, e.g. via word.GetString();
    }
}

//Read a PDF File sample


Full sample code which illustrates the basic text extraction capabilities.

Extract text under an annotation


The following snippet extracts the text located under an annotation in the document:

// Open the document, then fetch page 1 and the annotation at index 0 on it.
PDFDoc doc = new PDFDoc(filename);
Page page = doc.GetPage(1);
Annot annotation = page.GetAnnot(0);

TextExtractor txt = new TextExtractor();
txt.Begin(page); // Read the page.

// Text that the extractor reports as lying under the annotation.
string textData = txt.GetTextUnderAnnot(annotation);

//C++

PDFDoc doc(filename);
Page page = doc.GetPage(1);

TextExtractor txt;
txt.Begin(page); // Read the page.

// Visit every line reported by the extractor, then every word on that line.
for (TextExtractor::Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
{
    TextExtractor::Word word;
    for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
    {
        // Process the current word here, e.g. via word.GetString();
    }
}

//Read a PDF File sample


Full sample code which illustrates the basic text extraction capabilities.

Extract text under an annotation


The following snippet extracts the text located under an annotation in the document:

// Open the document, then fetch page 1 and the annotation at index 0 on it.
PDFDoc doc(filename);
Page page = doc.GetPage(1);
Annot annotation = page.GetAnnot(0);

// The extractor must read the page before it can be queried.
TextExtractor txt;
txt.Begin(page);

// Text that the extractor reports as lying under the annotation.
UString textData = txt.GetTextUnderAnnot(annotation);

//Go

// Open the document, then fetch page 1 and the annotation at index 0 on it.
doc := NewPDFDoc(filename)
page := doc.GetPage(1)
annotation := page.GetAnnot(0)

// The extractor must read the page before it can be queried.
txt := NewTextExtractor()
txt.Begin(page)

// Text that the extractor reports as lying under the annotation.
textData := txt.GetTextUnderAnnot(annotation)

//JavaScript

// Open the document and select the page passed to getPage (page 1 here).
const doc = await PDFNet.PDFDoc.createFromURL(filename);
const page = await doc.getPage(1);

const txt = await PDFNet.TextExtractor.create();

// Restrict extraction to the page's crop box.
const rect = await page.getCropBox();
// Await begin() so a failure surfaces here instead of as an unhandled rejection.
await txt.begin(page, rect); // Read the page.

// Extract words one by one: the outer loop walks the lines,
// the inner loop walks the words of the current line.
for (let line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine())
{
  // `word` is declared with `let`; assigning to an undeclared name creates an
  // implicit global and throws a ReferenceError in strict mode / ES modules.
  for (let word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord())
  {
    // Process the current word here, e.g. via await word.getString();
  }
}

//

// Open the document, then fetch page 1 and the annotation at index 0 on it.
const doc = await PDFNet.PDFDoc.createFromURL(filename);
const page = await doc.getPage(1);
const annotation = await page.getAnnot(0);

const txt = await PDFNet.TextExtractor.create();

// Restrict extraction to the page's crop box.
const rect = await page.getCropBox();
// Await begin() so the page has been read (and any error reported) before querying.
await txt.begin(page, rect); // Read the page.

// Text that the extractor reports as lying under the annotation.
const textData = await txt.getTextUnderAnnot(annotation);

//Table extraction
The REST API demo accepts a POST request at
https://ai-serve.pdftron.com/extract/predict. It returns HTML and XFDF in its
response.

Please visit our online table extraction demo to try out the PDFTron.AI tool in the
browser.
Here's an example code snippet for uploading a PDF to the demo using the API
endpoint:

// Wrap the raw PDF bytes in a File object.
// Declared with `const`: assigning to an undeclared name creates an implicit
// global and throws a ReferenceError in strict mode / ES modules.
// NOTE(review): `file` is built here but the request below sends `originalFile`;
// `originalFile`, `originalName` and `fileData` are not defined in this snippet — confirm
// against the surrounding code which object is meant to be uploaded.
const file = new File([fileData], 'mypdf.pdf');

// POST the PDF to the table-extraction endpoint; the response is passed to this.handleResp.
const xhttp = new XMLHttpRequest();
xhttp.onreadystatechange = () => this.handleResp(xhttp, originalFile, 'local');
const endpoint = 'https://ai-serve.pdftron.com/extract/predict';
xhttp.open('POST', endpoint, true); // true = asynchronous request
// NOTE(review): the body is a PDF file, yet the declared type is JSON — confirm what the service expects.
xhttp.setRequestHeader("Content-type", "application/json");
// Falls back to 'mypdf.pdf' when no original name is available.
xhttp.setRequestHeader("File-Name", originalName || 'mypdf.pdf');
xhttp.send(originalFile);

new

' Walks every file in objStartFolder and, for each .pdf that has no matching .txt
' beside it, opens the PDF with its associated application and drives that
' application's menus with SendKeys (Alt, f, h, x, Enter).
' NOTE(review): the key sequence presumably selects the viewer's "save as text"
' command - confirm it against the installed PDF viewer.
Set objFSO = CreateObject("Scripting.FileSystemObject")
Set WshShell = WScript.CreateObject("WScript.Shell")

objStartFolder = "PATH_OF_ALL_PDFS_YOU_WANT_TO_CONVERT_HERE"
Set objFolder = objFSO.GetFolder(objStartFolder)
Set colFiles = objFolder.Files

For Each objFile In colFiles
    ' GetExtensionName/GetBaseName replace the Mid(...) arithmetic, which raised
    ' an error for names shorter than four characters; LCase also accepts ".PDF".
    extension = "." & LCase(objFSO.GetExtensionName(objFile.Name))
    file = objFSO.GetBaseName(objFile.Name)
    fullname = objFSO.BuildPath(objStartFolder, objFile.Name)
    fullname_txt = objFSO.BuildPath(objStartFolder, file + ".txt")

    If extension = ".pdf" And Not objFSO.FileExists(fullname_txt) Then
        WScript.Echo fullname

        ' Open the PDF (path quoted so spaces survive), then send the menu keys
        ' with short pauses so the viewer can keep up.
        WshShell.Run """" + fullname + """"
        WScript.Sleep 1000
        WshShell.SendKeys "%"
        WScript.Sleep 100
        WshShell.SendKeys "f"
        WScript.Sleep 100
        WshShell.SendKeys "h"
        WScript.Sleep 100
        WshShell.SendKeys "x"
        WScript.Sleep 300
        WshShell.SendKeys "{ENTER}"

        ' This step prevents the loop from moving on to the next .pdf before the
        ' conversion to .txt is complete. The flag i must be reset for every file;
        ' otherwise the wait is skipped for every PDF after the first one.
        i = 0
        count = 0
        On Error Resume Next
        Do While i = 0 And count < 100
            Err.Clear
            ' Mode 8 = ForAppending; this fails while the .txt is missing.
            Set MyFile = objFSO.OpenTextFile(fullname_txt, 8)
            If Err.Number = 0 Then
                ' Release the handle so the output file is not left open.
                MyFile.Close
                i = 1
            End If
            count = count + 1
            WScript.Sleep 20000
        Loop
        ' Restore normal error reporting for the rest of the script.
        On Error GoTo 0
    End If
Next

You might also like