FPDFTEXT

Files

file  fpdftext.h
 

Header file for the text module - for text extraction.


Functions

DLLEXPORT FPDF_TEXTPAGE STDCALL FPDFText_LoadPage (FPDF_PAGE page)
DLLEXPORT void STDCALL FPDFText_ClosePage (FPDF_TEXTPAGE text_page)
DLLEXPORT int STDCALL FPDFText_CountChars (FPDF_TEXTPAGE text_page)
DLLEXPORT unsigned int STDCALL FPDFText_GetUnicode (FPDF_TEXTPAGE text_page, int index)
DLLEXPORT FPDF_BOOL STDCALL FPDFText_IsGenerated (FPDF_TEXTPAGE text_page, int index)
DLLEXPORT double STDCALL FPDFText_GetFontSize (FPDF_TEXTPAGE text_page, int index)
DLLEXPORT void STDCALL FPDFText_GetOrigin (FPDF_TEXTPAGE text_page, int index, double *x, double *y)
DLLEXPORT void STDCALL FPDFText_GetCharBox (FPDF_TEXTPAGE text_page, int index, double *left, double *right, double *bottom, double *top)
DLLEXPORT void STDCALL FPDFText_GetMatrix (FPDF_TEXTPAGE text_page, int index, double *a, double *b, double *c, double *d)
DLLEXPORT FPDF_FONT STDCALL FPDFText_GetFont (FPDF_TEXTPAGE text_page, int index)
DLLEXPORT int STDCALL FPDFFont_GetAscent (FPDF_FONT font)
DLLEXPORT int STDCALL FPDFFont_GetDescent (FPDF_FONT font)
DLLEXPORT FPDF_BYTESTRING STDCALL FPDFFont_GetName (FPDF_FONT font)
DLLEXPORT int STDCALL FPDFText_GetCharIndexAtPos (FPDF_TEXTPAGE text_page, double x, double y, double xTorelance, double yTolerance)
DLLEXPORT int STDCALL FPDFText_GetCharIndexByDirection (FPDF_TEXTPAGE text_page, int index, int direction)
DLLEXPORT int STDCALL FPDFText_GetText (FPDF_TEXTPAGE text_page, int start_index, int count, unsigned short *result)
DLLEXPORT int STDCALL FPDFText_CountRects (FPDF_TEXTPAGE text_page, int start_index, int count)
DLLEXPORT void STDCALL FPDFText_GetRect (FPDF_TEXTPAGE text_page, int rect_index, double *left, double *top, double *right, double *bottom)
DLLEXPORT int STDCALL FPDFText_GetBoundedText (FPDF_TEXTPAGE text_page, double left, double top, double right, double bottom, unsigned short *buffer, int buflen)
DLLEXPORT int STDCALL FPDFText_CountBoundedSegments (FPDF_TEXTPAGE text_page, double left, double top, double right, double bottom)
DLLEXPORT void STDCALL FPDFText_GetBoundedSegment (FPDF_TEXTPAGE text_page, int seg_index, int *start_index, int *count)
DLLEXPORT FPDF_SCHHANDLE STDCALL FPDFText_FindStart (FPDF_TEXTPAGE text_page, FPDF_WIDESTRING findwhat, unsigned long flags, int start_index)
DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindNext (FPDF_SCHHANDLE handle)
DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindPrev (FPDF_SCHHANDLE handle)
DLLEXPORT int STDCALL FPDFText_GetSchResultIndex (FPDF_SCHHANDLE handle)
DLLEXPORT int STDCALL FPDFText_GetSchCount (FPDF_SCHHANDLE handle)
DLLEXPORT void STDCALL FPDFText_FindClose (FPDF_SCHHANDLE handle)
DLLEXPORT FPDF_PAGELINK STDCALL FPDFLink_LoadWebLinks (FPDF_TEXTPAGE text_page)
DLLEXPORT int STDCALL FPDFLink_CountWebLinks (FPDF_PAGELINK link_page)
DLLEXPORT int STDCALL FPDFLink_GetURL (FPDF_PAGELINK link_page, int link_index, unsigned short *buffer, int buflen)
DLLEXPORT int STDCALL FPDFLink_CountRects (FPDF_PAGELINK link_page, int link_index)
DLLEXPORT void STDCALL FPDFLink_GetRect (FPDF_PAGELINK link_page, int link_index, int rect_index, double *left, double *top, double *right, double *bottom)
DLLEXPORT void STDCALL FPDFLink_CloseWebLinks (FPDF_PAGELINK link_page)
DLLEXPORT FPDF_BOOL STDCALL FPDFText_PDFToText (const char *sour_file, const char *dest_file, int flag, FPDF_BYTESTRING password)
DLLEXPORT int STDCALL FPDFText_PageToText (FPDF_DOCUMENT doc, int page_index, wchar_t *buf, int size, int flag)
 _FPDFTEXT_H_

Flags used by FPDFText_GetCharIndexByDirection function.

#define FPDFTEXT_LEFT   -1
 LEFT.
#define FPDFTEXT_RIGHT   1
 RIGHT.
#define FPDFTEXT_UP   -2
 UP.
#define FPDFTEXT_DOWN   2
 DOWN.

Flags used by FPDFText_FindStart function.

#define FPDF_MATCHCASE   0x00000001
 If not set, it will not match case by default.
#define FPDF_MATCHWHOLEWORD   0x00000002
 If not set, it will not match the whole word by default.

Flags used by FPDFText_PDFToText and FPDFText_PageToText.

#define FPDFTEXT_STREAM_ORDER   0
 STREAM.
#define FPDFTEXT_DISPLAY_ORDER   1
 DISPLAY.

Detailed Description

Header file for the text module


Function Documentation

DLLEXPORT int STDCALL FPDFFont_GetAscent ( FPDF_FONT  font)

Get font ascent (in 1/1000 em).

Parameters:
[in]font- Handle to a font. Returned by FPDFText_GetFont function.
Return values:
The ascent (typically the above-baseline height of letter "h"), measured in 1/1000 of em size. So if a character uses a font size (em size) of 10 points, and it has an ascent value of 500 (meaning half of the em), then the ascent height will be 5 points (5/72 inch).
DLLEXPORT int STDCALL FPDFFont_GetDescent ( FPDF_FONT  font)

Get font descent (in 1/1000 em).

Parameters:
[in]font- Handle to a font. Returned by FPDFText_GetFont function.
Return values:
The descent (typically the under-baseline height of letter "g"), measured in 1/1000 of em size. Most fonts have a negative descent value.
DLLEXPORT FPDF_BYTESTRING STDCALL FPDFFont_GetName ( FPDF_FONT  font)

Get the name of a font.

Parameters:
[in]font- Handle to a font. Returned by FPDFText_GetFont function.
Return values:
A pointer to a null-terminated string that specifies the name of the font. The application must not modify the returned string.
DLLEXPORT void STDCALL FPDFLink_CloseWebLinks ( FPDF_PAGELINK  link_page)

Release resources used by weblink feature.

Parameters:
[in]link_page- Handle returned by FPDFLink_LoadWebLinks.
Return values:
None.
DLLEXPORT int STDCALL FPDFLink_CountRects ( FPDF_PAGELINK  link_page,
int  link_index 
)

Count number of rectangular areas for the link.

Parameters:
[in]link_page- Handle returned by FPDFLink_LoadWebLinks.
[in]link_index- Zero-based index for the link.
Return values:
Number of rectangular areas for the link.
DLLEXPORT int STDCALL FPDFLink_CountWebLinks ( FPDF_PAGELINK  link_page)

Count number of detected web links.

Parameters:
[in]link_page- Handle returned by FPDFLink_LoadWebLinks.
Return values:
Number of detected web links.
DLLEXPORT void STDCALL FPDFLink_GetRect ( FPDF_PAGELINK  link_page,
int  link_index,
int  rect_index,
double *  left,
double *  top,
double *  right,
double *  bottom 
)

Fetch the boundaries of a rectangle for a link.

Parameters:
[in]link_page- Handle returned by FPDFLink_LoadWebLinks.
[in]link_index- Zero-based index for the link.
[in]rect_index- Zero-based index for a rectangle.
[out]left- Pointer to a double receiving the rectangle left boundary.
[out]top- Pointer to a double receiving the rectangle top boundary.
[out]right- Pointer to a double receiving the rectangle right boundary.
[out]bottom- Pointer to a double receiving the rectangle bottom boundary.
Return values:
None.
DLLEXPORT int STDCALL FPDFLink_GetURL ( FPDF_PAGELINK  link_page,
int  link_index,
unsigned short *  buffer,
int  buflen 
)

Fetch the URL information for a detected web link.

Parameters:
[in]link_page- Handle returned by FPDFLink_LoadWebLinks.
[in]link_index- Zero-based index for the link.
[out]buffer- A unicode buffer receiving the URL.
[in]buflen- Number of characters (not bytes) for the buffer, excluding an additional terminator.
Return values:
If buffer is NULL or buflen is zero, returns the number of characters (not bytes) needed; otherwise, returns the number of characters copied into the buffer.
DLLEXPORT FPDF_PAGELINK STDCALL FPDFLink_LoadWebLinks ( FPDF_TEXTPAGE  text_page)

Prepare information about weblinks in a page.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
Note:
Weblinks are those links implicitly embedded in PDF pages. PDF also has a type of annotation called "link"; FPDFTEXT doesn't deal with that kind of link. The FPDFTEXT weblink feature is useful for automatically detecting links in the page contents. For example, things like "http://www.foxitsoftware.com" will be detected, so applications can allow the user to click on those characters to activate the link, even if the PDF doesn't come with link annotations.

FPDFLink_CloseWebLinks must be called to release resources.

Return values:
A handle to the page's links information structure. NULL if something goes wrong.
DLLEXPORT void STDCALL FPDFText_ClosePage ( FPDF_TEXTPAGE  text_page)

Release all resources allocated for a text page information structure.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
Return values:
None.
DLLEXPORT int STDCALL FPDFText_CountBoundedSegments ( FPDF_TEXTPAGE  text_page,
double  left,
double  top,
double  right,
double  bottom 
)

Get number of text segments within a rectangular boundary on the page.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]left- Left boundary.
[in]top- Top boundary.
[in]right- Right boundary.
[in]bottom- Bottom boundary.
Return values:
Number of segments.
DLLEXPORT int STDCALL FPDFText_CountChars ( FPDF_TEXTPAGE  text_page)

Get number of characters in a page.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
Note:
Characters in a page form a "stream". Inside the stream, each character has an index. These index parameters are used in FPDFText_xxx functions. The first character in the page has an index value of zero. Generated characters, like additional space characters, new line characters, are also counted.
Return values:
Number of characters in the page. Returns -1 on error.
DLLEXPORT int STDCALL FPDFText_CountRects ( FPDF_TEXTPAGE  text_page,
int  start_index,
int  count 
)

Count number of rectangular areas occupied by a segment of texts.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]start_index- Index for the start character.
[in]count- Number of characters.
Note:
This function, along with FPDFText_GetRect, can be used by applications to detect the position on the page for a text segment, so the proper areas can be highlighted. FPDFTEXT will automatically merge small character boxes into a bigger one if those characters are on the same line and use the same font settings.
Return values:
Number of rectangles. Zero on error.
DLLEXPORT void STDCALL FPDFText_FindClose ( FPDF_SCHHANDLE  handle)

Release a search context.

Parameters:
[in]handle- A search context handle returned by FPDFText_FindStart.
Return values:
None.
DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindNext ( FPDF_SCHHANDLE  handle)

Search in the direction from page start to end.

Parameters:
[in]handle- A search context handle returned by FPDFText_FindStart.
Return values:
Whether a match is found.
DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindPrev ( FPDF_SCHHANDLE  handle)

Search in the direction from page end to start.

Parameters:
[in]handle- A search context handle returned by FPDFText_FindStart.
Return values:
Whether a match is found.
DLLEXPORT FPDF_SCHHANDLE STDCALL FPDFText_FindStart ( FPDF_TEXTPAGE  text_page,
FPDF_WIDESTRING  findwhat,
unsigned long  flags,
int  start_index 
)

Start a search.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]findwhat- A unicode match pattern.
[in]flags- Option flags.
[in]start_index- Start from this character. -1 for end of the page.
Return values:
A handle for the search context. FPDFText_FindClose must be called to release this handle.
DLLEXPORT void STDCALL FPDFText_GetBoundedSegment ( FPDF_TEXTPAGE  text_page,
int  seg_index,
int *  start_index,
int *  count 
)

Get a particular segment in the result generated by FPDFText_CountBoundedSegments function.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]seg_index- Zero-based index for the segment.
[out]start_index- Pointer to an integer receiving the start character index for the segment.
[out]count- Pointer to an integer receiving the number of characters in the segment.
Return values:
None.
DLLEXPORT int STDCALL FPDFText_GetBoundedText ( FPDF_TEXTPAGE  text_page,
double  left,
double  top,
double  right,
double  bottom,
unsigned short *  buffer,
int  buflen 
)

Extract unicode text within a rectangular boundary on the page.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]left- Left boundary.
[in]top- Top boundary.
[in]right- Right boundary.
[in]bottom- Bottom boundary.
[out]buffer- A unicode buffer receiving the extracted text.
[in]buflen- Number of characters (not bytes) for the buffer, excluding an additional terminator.
Return values:
If buffer is NULL or buflen is zero, returns the number of characters (not bytes) needed; otherwise, returns the number of characters copied into the buffer.
DLLEXPORT void STDCALL FPDFText_GetCharBox ( FPDF_TEXTPAGE  text_page,
int  index,
double *  left,
double *  right,
double *  bottom,
double *  top 
)

Get bounding box of a particular character.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]index- Zero-based index of the character.
[out]left- Pointer to a double receiving left position of the character box.
[out]right- Pointer to a double receiving right position of the character box.
[out]bottom- Pointer to a double receiving bottom position of the character box.
[out]top- Pointer to a double receiving top position of the character box.
Note:
All positions are measured in PDF "user space".
Return values:
None.
DLLEXPORT int STDCALL FPDFText_GetCharIndexAtPos ( FPDF_TEXTPAGE  text_page,
double  x,
double  y,
double  xTorelance,
double  yTolerance 
)

Get the index of a character at or nearby a certain position on the page.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]x- X position in PDF "user space".
[in]y- Y position in PDF "user space".
[in]xTolerance- A x-axis tolerance value for character hit detection, in point unit.
[in]yTolerance- A y-axis tolerance value for character hit detection, in point unit.
Return values:
The zero-based index of the character at, or nearby the point (x,y). If there is no character at or nearby the point, return value will be -1. If an error occurs, return value will be -3.
DLLEXPORT int STDCALL FPDFText_GetCharIndexByDirection ( FPDF_TEXTPAGE  text_page,
int  index,
int  direction 
)

Move the character index in different directions and get new character index, from a specific character.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]index- Zero-based index for the current character.
[in]direction- A number indicating the moving direction. Can be one of the following: FPDFTEXT_LEFT, FPDFTEXT_UP, FPDFTEXT_RIGHT, FPDFTEXT_DOWN.
Note:
FPDFTEXT moves the character pointer according to "stream order". For example, left will move to the previous character, right will move to the next character. Because in PDF "stream order" can be different from "appearance order" (the order that appears to human eyes), it's possible that the moving direction doesn't match the actual position movement.
Return values:
Zero-based character index for the new position. -1 if beginning of the page reached; -2 if end of the page reached; -3 for failures.
DLLEXPORT FPDF_FONT STDCALL FPDFText_GetFont ( FPDF_TEXTPAGE  text_page,
int  index 
)

Get font of a particular character.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]index- Zero-based index of the character.
Return values:
A handle to the font used by the particular character. This handle can be used in FPDFFont_xxx functions for more information about the font.
DLLEXPORT double STDCALL FPDFText_GetFontSize ( FPDF_TEXTPAGE  text_page,
int  index 
)

Get the font size of a particular character.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]index- Zero-based index of the character.
Return values:
The font size of the particular character, measured in points (about 1/72 inch). This is the typographic size of the font (so called "em size").
DLLEXPORT void STDCALL FPDFText_GetMatrix ( FPDF_TEXTPAGE  text_page,
int  index,
double *  a,
double *  b,
double *  c,
double *  d 
)

Get the matrix of a particular character.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]index- Zero-based index of the character.
[out]a- Pointer to a double receiving the coefficient "a" of the matrix.
[out]b- Pointer to a double receiving the coefficient "b" of the matrix.
[out]c- Pointer to a double receiving the coefficient "c" of the matrix.
[out]d- Pointer to a double receiving the coefficient "d" of the matrix.
Note:
A matrix defines a coordinate transformation from one coordinate space to another. In PDF, a matrix is defined by the following equations: x' = a * x + c * y + e; y' = b * x + d * y + f. The FPDFText_GetMatrix function is used to get the a, b, c, d coefficients of the transformation from "text space" to "user space". The e, f coefficients are actually the origin position, which can be fetched by the FPDFText_GetOrigin function.
Return values:
None.
DLLEXPORT void STDCALL FPDFText_GetOrigin ( FPDF_TEXTPAGE  text_page,
int  index,
double *  x,
double *  y 
)

Get origin position of a particular character.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]index- Zero-based index of the character.
[out]x- Pointer to a double receiving X position of the character origin.
[out]y- Pointer to a double receiving Y position of the character origin.
Note:
Origin X/Y positions are measured in PDF "user space".
Return values:
None.
DLLEXPORT void STDCALL FPDFText_GetRect ( FPDF_TEXTPAGE  text_page,
int  rect_index,
double *  left,
double *  top,
double *  right,
double *  bottom 
)

Get a rectangular area from the result generated by FPDFText_CountRects.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]rect_index- Zero-based index for the rectangle.
[out]left- Pointer to a double receiving the rectangle left boundary.
[out]top- Pointer to a double receiving the rectangle top boundary.
[out]right- Pointer to a double receiving the rectangle right boundary.
[out]bottom- Pointer to a double receiving the rectangle bottom boundary.
Return values:
None.
DLLEXPORT int STDCALL FPDFText_GetSchCount ( FPDF_SCHHANDLE  handle)

Get the number of matched characters in the search result.

Parameters:
[in]handle- A search context handle returned by FPDFText_FindStart.
Return values:
Number of matched characters.
DLLEXPORT int STDCALL FPDFText_GetSchResultIndex ( FPDF_SCHHANDLE  handle)

Get the starting character index of the search result.

Parameters:
[in]handle- A search context handle returned by FPDFText_FindStart.
Return values:
Index for the starting character.
DLLEXPORT int STDCALL FPDFText_GetText ( FPDF_TEXTPAGE  text_page,
int  start_index,
int  count,
unsigned short *  result 
)

Extract unicode text string from the page.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]start_index- Index for the start character.
[in]count- Number of characters to be extracted.
[out]result- A buffer (allocated by the application) receiving the extracted unicode characters. The size of the buffer must be able to hold the number of characters plus a terminator.
Note:
This function ignores characters without unicode information.
Return values:
Number of characters written into the result buffer, excluding the trailing terminator.
DLLEXPORT unsigned int STDCALL FPDFText_GetUnicode ( FPDF_TEXTPAGE  text_page,
int  index 
)

Get Unicode of a character in a page.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]index- Zero-based index of the character.
Return values:
The Unicode value of the character. If a character is not encoded in Unicode and cannot be converted to Unicode by the Foxit engine, the return value will be zero.
DLLEXPORT FPDF_BOOL STDCALL FPDFText_IsGenerated ( FPDF_TEXTPAGE  text_page,
int  index 
)

Indicate whether a character is a generated character.

Parameters:
[in]text_page- Handle to a text page information structure. Returned by FPDFText_LoadPage function.
[in]index- Zero-based index of the character.
Note:
"Generated character" is a character generated by the FPDFTEXT engine to keep formatting information. It is not actually encoded in the PDF page. This happens in two cases: 1) an extra space character will be generated if two characters in the same line appear to be separated by a significant gap, 2) a new line character will be generated if two consecutive characters appear to be on different lines. These characters are useful when doing the search.
Return values:
TRUE indicates a generated character and FALSE indicates an actual character in the PDF page.
DLLEXPORT FPDF_TEXTPAGE STDCALL FPDFText_LoadPage ( FPDF_PAGE  page)

Prepare information about all characters in a page.

Parameters:
[in]page- Handle to the page. Returned by FPDF_LoadPage function (in view module).
Return values:
A handle to the text page information structure. NULL if something goes wrong.
Note:
Application must call FPDFText_ClosePage to release the text page information. If the Text Module is not unlocked, this function will return NULL.
DLLEXPORT int STDCALL FPDFText_PageToText ( FPDF_DOCUMENT  doc,
int  page_index,
wchar_t *  buf,
int  size,
int  flag 
)

_FPDFTEXT_H_

Convert a PDF page data to a text buffer.

Parameters:
[in]doc- Handle to document. Returned by FPDF_LoadDocument function.
[in]page_index- Index number of the page. 0 for the first page.
[out]buf- An output buffer used to hold the text of the page.
[in]size- Size of the buffer.
[in]flag- 0 for stream order, 1 for appearance order.
Return values:
If buf is NULL or size is zero, the number of characters (not bytes) needed; otherwise, the number of characters copied into buf.
DLLEXPORT FPDF_BOOL STDCALL FPDFText_PDFToText ( const char *  sour_file,
const char *  dest_file,
int  flag,
FPDF_BYTESTRING  password 
)

Convert a PDF file to a TXT file.

Parameters:
[in]sour_file- Path to the PDF file you want to convert.
[in]dest_file- The path of the file you want to save.
[in]flag- 0 for stream order, 1 for appearance order.
[in]password- A string used as the password for PDF file. If no password needed, empty or NULL can be used.
Return values:
TRUE for success, FALSE for failure.