I assume that you already know how to implement basic plug-in functionality using the Acrobat SDK. The version of the SDK used in this code example is the Acrobat XI SDK. I also assume the following:
- Read PDF 32000-1:2008 for further information
wwwimages.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf
- The offset of a term is its index inside the PDF document. For example, the first term of the PDF document has offset 0, the next term has offset 1, and so on.
ACCB1 void ACCB2 termExtractor() { // try to get front PDF document AVDoc avDoc = AVAppGetActiveDoc(); //Display words of the pdf file. PDDoc currentPDDoc =AVDocGetPDDoc(avDoc); AVPageView currentPageView = AVDocGetPageView (avDoc); ASInt32 pageNum = AVPageViewGetPageNum(currentPageView); //Create a PDWordFinderConfigRec object; PDWordFinderConfigRec pConfig; //Set the DWordFinderConfigRec object's attributes memset(&pConfig, 0, sizeof(PDWordFinderConfigRec)); pConfig.recSize = sizeof(PDWordFinderConfigRec); pConfig.ignoreCharGaps = true; pConfig.ignoreLineGaps = true; pConfig.noAnnots = true; pConfig.noEncodingGuess = true; //Create a PDWordFinder object PDWordFinder pdWordFinder = PDDocCreateWordFinderEx(currentPDDoc, WF_LATEST_VERSION, false, &pConfig); // Acquire all the terms inside the PDF page. ASInt32 numWords; PDWord wordInfo; PDWord *pXYSortTable; PDWordFinderAcquireWordList(pdWordFinder, pageNum,&wordInfo, &pXYSortTable, NULL, &nWords); // Create COS Dictionary to keep track of all the words and their offset. CosDoc cd; CosObj Dict; cd = PDDocGetCosDoc(currentPDDoc); Dict = CosNewDict(cd,false,nWords); PDWord pdNWord = PDWordFinderGetNthWord(pdWordFinder, nWordCounter ); for(int nWordCounter = 0; nWordCounter < nWords; nWordCounter++) { // Get the word as a string char stringBuffer[125]; PDWordGetString (pdNWord, stringBuffer, sizeof(stringBuffer)); pdfCorpus << stringBuffer; // Add each term into COS Dictionary to use it later with highlighting method // Offset is the location of each term in the document. First term offset is 0 and next term is 1 etc. bool keyExist = CosDictKnown(Dict,ASAtomFromString(stringBuffer)); if( keyExist == true) // To-do: Duplicate term { // To-do: catch duplilcates } else // new term { CosDictPut(Dict,ASAtomFromString(stringBuffer), CosNewInteger(cd,false,nWordCounter)); } }