I assume that you already know how to implement basic plug-in functionality using the Acrobat SDK. The version of the SDK used in this code example is the Acrobat XI SDK. I also assume the following:
- Read PDF 32000-1:2008 for further information:
  wwwimages.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf
- The offset of a term is its index inside the PDF document. For example, the first term of the PDF document has offset 0, the next term has offset 1, and so on.
ACCB1 void ACCB2 termExtractor()
{
// try to get front PDF document
AVDoc avDoc = AVAppGetActiveDoc();
//Display words of the pdf file.
PDDoc currentPDDoc =AVDocGetPDDoc(avDoc);
AVPageView currentPageView = AVDocGetPageView (avDoc);
ASInt32 pageNum = AVPageViewGetPageNum(currentPageView);
//Create a PDWordFinderConfigRec object;
PDWordFinderConfigRec pConfig;
//Set the PDWordFinderConfigRec object's attributes
memset(&pConfig, 0, sizeof(PDWordFinderConfigRec));
pConfig.recSize = sizeof(PDWordFinderConfigRec);
pConfig.ignoreCharGaps = true;
pConfig.ignoreLineGaps = true;
pConfig.noAnnots = true;
pConfig.noEncodingGuess = true;
//Create a PDWordFinder object
PDWordFinder pdWordFinder = PDDocCreateWordFinderEx(currentPDDoc, WF_LATEST_VERSION, false, &pConfig);
// Acquire all the terms inside the PDF page.
ASInt32 numWords;
PDWord wordInfo;
PDWord *pXYSortTable;
PDWordFinderAcquireWordList(pdWordFinder, pageNum,&wordInfo, &pXYSortTable, NULL, &nWords);
// Create COS Dictionary to keep track of all the words and their offset.
CosDoc cd;
CosObj Dict;
cd = PDDocGetCosDoc(currentPDDoc);
Dict = CosNewDict(cd,false,nWords);
PDWord pdNWord = PDWordFinderGetNthWord(pdWordFinder, nWordCounter );
for(int nWordCounter = 0; nWordCounter < nWords; nWordCounter++)
{
// Get the word as a string
char stringBuffer[125];
PDWordGetString (pdNWord, stringBuffer, sizeof(stringBuffer));
pdfCorpus << stringBuffer;
// Add each term into COS Dictionary to use it later with highlighting method
// Offset is the location of each term in the document. First term offset is 0 and next term is 1 etc.
bool keyExist = CosDictKnown(Dict,ASAtomFromString(stringBuffer));
if( keyExist == true) // To-do: Duplicate term
{
// To-do: catch duplicates
}
else // new term
{
CosDictPut(Dict,ASAtomFromString(stringBuffer), CosNewInteger(cd,false,nWordCounter));
}
}