This code snippet explains how to extract text that was highlighted with the Text Highlight tool, from a plug-in built with the Acrobat SDK. I assume that you already know how to implement basic plug-in functionality using the Acrobat SDK. The version of the SDK used in this code example is the Acrobat XI SDK. I also assume the following prerequisites:
- Read PDF 32000-1:2008, section 12.5.6.10, “Text Markup Annotations”, for further information: wwwimages.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf
- You can use the PDWordFinderAcquireWordList() method to get the full word list and then use a loop to create the CosObj dictionary. Read my previous post about how to extract terms from a PDF and create a COS dictionary.
Step 1: If you start from BasicPlugin.cpp in the Acrobat SDK, you should already have the following function, which runs when you click your plug-in's item in the menu bar:
/**
 * Menu-item handler: gathers the text under the Highlight annotations on the
 * page currently shown in the front document and displays it in an alert.
 *
 * The text itself is collected by getHighlightedText(), which is driven
 * through a word finder and appends to the shared pdfCorpus stream; this
 * function then shows whatever pdfCorpus contains.
 *
 * If no document is open, an explanatory alert is shown instead and no
 * document API is called.
 *
 * @param clientData  Unused.
 */
ACCB1 void ACCB2 MyPluginCommand(void *clientData)
{
    // Check for an open document first: every call below needs a valid AVDoc.
    AVDoc avDoc = AVAppGetActiveDoc();
    if (avDoc == NULL) {
        // Get this plugin's name for display.
        ASAtom nameAtom = ASExtensionGetRegisteredName(gExtensionID);
        const char *name = ASAtomGetString(nameAtom);

        // Build the message with std::string rather than sprintf into a
        // fixed buffer (and never format a buffer into itself).
        std::string message = "This menu item is added by plugin ";
        message += (name != NULL ? name : "");
        message += ".\n";
        message += "There is no PDF document loaded in Acrobat.";
        AVAlertNote(message.c_str());
        return;
    }

    // Identify the document and the page currently displayed in its view.
    PDDoc currentPDDoc = AVDocGetPDDoc(avDoc);
    AVPageView currentPageView = AVDocGetPageView(avDoc);
    ASInt32 pageNum = AVPageViewGetPageNum(currentPageView);

    // Set the PDWordFinderConfigRec attributes (zero everything else).
    PDWordFinderConfigRec pConfig;
    memset(&pConfig, 0, sizeof(PDWordFinderConfigRec));
    pConfig.recSize = sizeof(PDWordFinderConfigRec);
    pConfig.ignoreCharGaps = true;
    pConfig.ignoreLineGaps = true;
    pConfig.noAnnots = true;
    pConfig.noEncodingGuess = true;

    // Create a PDWordFinder object.
    PDWordFinder pdWordFinder =
        PDDocCreateWordFinderEx(currentPDDoc, WF_LATEST_VERSION, false, &pConfig);

    // Enumerate words on the current page. getHighlightedText() returns
    // false, which halts the enumeration after its first invocation, so the
    // page's annotations are scanned once rather than once per word.
    PDWordProc wordProc = ASCallbackCreateProto(PDWordProc, &getHighlightedText);
    PDWordFinderEnumWords(pdWordFinder, pageNum, wordProc, NULL);

    // Release everything created above.
    PDWordFinderDestroy(pdWordFinder);
    ASCallbackDestroy(reinterpret_cast<void *>(wordProc));

    // Display the highlighted text accumulated in pdfCorpus.
    std::string highlighted = pdfCorpus.str();
    AVAlertNote(highlighted.c_str());
}
Step 2: Now use the getHighlightedText callback to go through all the annotations on the page and create a PDTextSelect object for each Highlight annotation.
/**
 * PDWordProc callback that extracts the text under every Highlight
 * annotation on page pgNum of the active document.
 *
 * For each Highlight annotation, a text selection is created from the
 * annotation's rectangle and its text runs are passed to
 * pdTextSelectEnumTextProc().
 *
 * NOTE(review): the annotation rectangle is used as the selection area, so a
 * highlight spanning several lines may pick up neighbouring text inside that
 * rectangle; the annotation's QuadPoints would be more precise — confirm
 * whether that matters for your documents.
 *
 * @param wObj        Word finder driving the enumeration (unused).
 * @param wInfo       Current word (unused; the work is per page, not per word).
 * @param pgNum       Page number whose annotations are scanned.
 * @param clientData  Unused.
 * @return Always false, which halts the word enumeration after this first
 *         call so the annotations are processed only once per page.
 */
ACCB1 ASBool ACCB2 getHighlightedText(PDWordFinder wObj, PDWord wInfo, ASInt32 pgNum, void *clientData)
{
    AVDoc avDoc = AVAppGetActiveDoc();
    if (avDoc == NULL) {
        return false;  // nothing to scan without an open document
    }
    PDDoc currentPDDoc = AVDocGetPDDoc(avDoc);

    // Acquired pages must be released; see PDPageRelease() below.
    PDPage pdpage = PDDocAcquirePage(currentPDDoc, pgNum);
    const ASInt32 numAnnots = PDPageGetNumAnnots(pdpage);

    // Loop-invariant values: the subtype atom and the text-run callback.
    const ASAtom highlightSubtype = ASAtomFromString("Highlight");
    PDTextSelectEnumTextProc textProc =
        ASCallbackCreateProto(PDTextSelectEnumTextProc, &pdTextSelectEnumTextProc);

    for (ASInt32 i = 0; i < numAnnots; i++) {
        PDAnnot annot = PDPageGetAnnot(pdpage, i);
        if (PDAnnotGetSubtype(annot) != highlightSubtype) {
            continue;
        }

        // Gets the annotation's rect (bounding rectangle of the highlight).
        ASFixedRect boundingRect;
        PDAnnotGetRect(annot, &boundingRect);

        // Gets the text selection from the annotation's rect.
        PDTextSelect textSelect = PDDocCreateTextSelect(currentPDDoc, pgNum, &boundingRect);
        if (textSelect != NULL) {
            // The callback does not use its client-data argument.
            PDTextSelectEnumText(textSelect, textProc, NULL);
            PDTextSelectDestroy(textSelect);
        }
    }

    ASCallbackDestroy(reinterpret_cast<void *>(textProc));
    PDPageRelease(pdpage);

    // false == stop enumerating words; one pass over the annotations suffices.
    return false;
}
Step 3: Create a callback function to extract the text from the PDTextSelect object. Here, pdfCorpus is a stringstream shared across these functions, so the collected text can also be used in another part of the code.
/**
 * PDTextSelectEnumTextProc callback: appends one text run of a selection to
 * the shared pdfCorpus stream.
 *
 * The run is appended directly from the caller's buffer, bounded by textLen,
 * so runs of any length are handled without a fixed-size intermediate copy.
 *
 * @param procObj  Client data from PDTextSelectEnumText (unused).
 * @param font     Font of the run (unused).
 * @param size     Font size of the run (unused).
 * @param color    Colour of the run (unused).
 * @param text     Bytes of the text run; may be NULL, in which case nothing
 *                 is appended.
 * @param textLen  Number of bytes available in text.
 * @return Always true, so enumeration continues with the next run.
 */
ACCB1 ASBool ACCB2 pdTextSelectEnumTextProc(void* procObj, PDFont font, ASFixed size, PDColorValue color, char* text, ASInt32 textLen){
    if (text != NULL && textLen > 0) {
        // Stop at the first NUL (as streaming a C string would), but never
        // read past textLen bytes.
        const size_t maxLen = static_cast<size_t>(textLen);
        const void *terminator = memchr(text, '\0', maxLen);
        const size_t runLen = (terminator != NULL)
            ? static_cast<size_t>(static_cast<const char *>(terminator) - text)
            : maxLen;
        pdfCorpus.write(text, static_cast<std::streamsize>(runLen));
    }
    return true;
}
No comments:
Post a Comment