Documentation
¶
Overview ¶
Package tabula provides a fluent API for extracting text, tables, and other content from PDF, DOCX, ODT, XLSX, PPTX, HTML, and EPUB files.
Basic usage:
text, warnings, err := tabula.Open("document.pdf").Text()
if err != nil {
// handle error
}
if len(warnings) > 0 {
log.Println("Warnings:", tabula.FormatWarnings(warnings))
}
DOCX files work the same way:
text, warnings, err := tabula.Open("document.docx").Text()
With options:
text, _, err := tabula.Open("report.pdf").
Pages(1, 2, 3).
ExcludeHeaders().
ExcludeFooters().
Text()
HTML content can be parsed from a string (useful for web scraping):
text, _, err := tabula.FromHTMLString(htmlContent).Text()
For advanced use cases, the lower-level reader package is also available.
Example (ChunkFiltering) ¶
package main
import (
"github.com/tsawler/tabula"
)
func main() {
chunks, _, _ := tabula.Open("doc.pdf").Chunks()
// Filter by content type
tablesOnly := chunks.FilterWithTables()
listsOnly := chunks.FilterWithLists()
_ = tablesOnly
_ = listsOnly
// Filter by location
section := chunks.FilterBySection("Introduction")
page5 := chunks.FilterByPage(5)
pages1to10 := chunks.FilterByPageRange(1, 10)
_ = section
_ = page5
_ = pages1to10
// Filter by size
smallChunks := chunks.FilterByMaxTokens(500)
largeChunks := chunks.FilterByMinTokens(100)
_ = smallChunks
_ = largeChunks
// Search
matches := chunks.Search("keyword")
_ = matches
// Chain filters
result := chunks.
FilterBySection("Methods").
FilterByMinTokens(100).
Search("algorithm")
_ = result
}
Example (ChunkMetadata) ¶
package main
import (
"fmt"
"github.com/tsawler/tabula"
)
func main() {
chunks, _, _ := tabula.Open("doc.pdf").Chunks()
for _, chunk := range chunks.Chunks {
fmt.Println("ID:", chunk.ID)
fmt.Println("Section:", chunk.Metadata.SectionTitle)
fmt.Println("Pages:", chunk.Metadata.PageStart, "-", chunk.Metadata.PageEnd)
fmt.Println("Words:", chunk.Metadata.WordCount)
fmt.Println("Tokens:", chunk.Metadata.EstimatedTokens)
fmt.Println("Has Table:", chunk.Metadata.HasTable)
fmt.Println("Has List:", chunk.Metadata.HasList)
}
}
Example (ChunksAsMarkdown) ¶
package main
import (
"log"
"github.com/tsawler/tabula"
)
func main() {
chunks, _, err := tabula.Open("document.pdf").
ExcludeHeadersAndFooters().
Chunks()
if err != nil {
log.Fatal(err)
}
// Get each chunk as separate markdown strings
mdChunks := chunks.ToMarkdownChunks()
for i, md := range mdChunks {
// Example: store each chunk in your vector database
_ = chunks.Chunks[i].ID
_ = md
}
}
Example (CollectionStatistics) ¶
package main
import (
"fmt"
"github.com/tsawler/tabula"
)
func main() {
chunks, _, _ := tabula.Open("doc.pdf").Chunks()
stats := chunks.Statistics()
fmt.Println("Total chunks:", stats.TotalChunks)
fmt.Println("Total words:", stats.TotalWords)
fmt.Println("Average tokens:", stats.AvgTokens)
fmt.Println("Chunks with tables:", stats.ChunksWithTables)
}
Example (CustomChunkSizing) ¶
package main
import (
"github.com/tsawler/tabula"
"github.com/tsawler/tabula/rag"
)
func main() {
config := rag.ChunkerConfig{
TargetChunkSize: 500, // Target characters per chunk
MaxChunkSize: 1000, // Maximum characters
MinChunkSize: 100, // Minimum characters
OverlapSize: 50, // Overlap between chunks
}
sizeConfig := rag.DefaultSizeConfig()
chunks, _, _ := tabula.Open("doc.pdf").ChunksWithConfig(config, sizeConfig)
_ = chunks
}
Example (ErrorHandling) ¶
package main
import (
"github.com/tsawler/tabula"
)
func main() {
// Panic on error (for scripts/tests)
text := tabula.MustText(tabula.Open("doc.pdf").Text())
count := tabula.Must(tabula.Open("doc.pdf").PageCount())
_ = text
_ = count
}
Example (ExtractMarkdown) ¶
package main
import (
"github.com/tsawler/tabula"
)
func main() {
// PDF with header/footer exclusion
markdown, warnings, err := tabula.Open("document.pdf").
ExcludeHeadersAndFooters().
ToMarkdown()
_ = markdown
_ = warnings
_ = err
// DOCX (preserves headings, lists, tables)
markdown, warnings, err = tabula.Open("document.docx").ToMarkdown()
_ = markdown
_ = warnings
_ = err
}
Example (ExtractText) ¶
package main
import (
"fmt"
"log"
"github.com/tsawler/tabula"
)
func main() {
// Works with both PDF and DOCX files
text, warnings, err := tabula.Open("document.pdf").Text()
// text, warnings, err := tabula.Open("document.docx").Text()
if err != nil {
log.Fatal(err)
}
fmt.Println(text)
for _, w := range warnings {
fmt.Println("Warning:", w.Message)
}
}
Example (ExtractWithOptions) ¶
package main
import (
"github.com/tsawler/tabula"
)
func main() {
text, warnings, err := tabula.Open("document.pdf").
Pages(1, 2, 3). // Specific pages (PDF only)
ExcludeHeadersAndFooters(). // Remove repeating headers/footers (PDF only)
JoinParagraphs(). // Join text into paragraphs (PDF only)
Text()
_ = text
_ = warnings
_ = err
}
Example (InspectionMethods) ¶
package main
import (
"github.com/tsawler/tabula"
)
func main() {
ext := tabula.Open("document.pdf")
defer ext.Close()
isCharLevel, _ := ext.IsCharacterLevel() // Detect character-level PDFs
isMultiCol, _ := ext.IsMultiColumn() // Detect multi-column layouts
pageCount, _ := ext.PageCount() // Get page count (works with DOCX too)
_ = isCharLevel
_ = isMultiCol
_ = pageCount
}
Example (MarkdownOptions) ¶
package main
import (
"github.com/tsawler/tabula"
"github.com/tsawler/tabula/rag"
)
func main() {
opts := rag.MarkdownOptions{
IncludeMetadata: true, // YAML front matter
IncludeTableOfContents: true, // Generated TOC
IncludeChunkSeparators: true, // --- between chunks
IncludePageNumbers: true, // Page references
IncludeChunkIDs: true, // HTML comments with chunk IDs
}
markdown, _, _ := tabula.Open("doc.pdf").ToMarkdownWithOptions(opts)
_ = markdown
// Or use preset for RAG
opts = rag.RAGOptimizedMarkdownOptions()
_ = opts
}
Example (OpenDocuments) ¶
package main
import (
"github.com/tsawler/tabula"
"github.com/tsawler/tabula/reader"
)
func main() {
// From file path (format auto-detected by extension)
ext := tabula.Open("document.pdf")
_ = ext
ext = tabula.Open("document.docx")
_ = ext
// From existing PDF reader (PDF only)
r, _ := reader.Open("document.pdf")
ext = tabula.FromReader(r)
_ = ext
}
Example (RagChunking) ¶
package main
import (
"fmt"
"log"
"github.com/tsawler/tabula"
)
func main() {
// Works with both PDF and DOCX
chunks, warnings, err := tabula.Open("document.pdf").Chunks()
// chunks, warnings, err := tabula.Open("document.docx").Chunks()
if err != nil {
log.Fatal(err)
}
for i, chunk := range chunks.Chunks {
fmt.Printf("Chunk %d: %s (p.%d-%d, ~%d tokens)\n",
i+1,
chunk.Metadata.SectionTitle,
chunk.Metadata.PageStart,
chunk.Metadata.PageEnd,
chunk.Metadata.EstimatedTokens)
fmt.Println(chunk.Text)
fmt.Println("---")
}
// Warnings are non-fatal issues
for _, w := range warnings {
fmt.Println("Warning:", w.Message)
}
}
Example (Warnings) ¶
package main
import (
"log"
"github.com/tsawler/tabula"
)
func main() {
text, warnings, err := tabula.Open("document.pdf").Text()
if err != nil {
log.Fatal(err) // Fatal error
}
_ = text
for _, w := range warnings {
log.Println("Warning:", w.Message) // Non-fatal issues
}
// Format all warnings
formatted := tabula.FormatWarnings(warnings)
_ = formatted
}
Index ¶
- func FormatWarnings(warnings []Warning) string
- func Must[T any](val T, err error) T
- func MustText[T any](val T, _ []Warning, err error) T
- type ExtractOptions
- type Extractor
- func (e *Extractor) Analyze() (*layout.AnalysisResult, error)
- func (e *Extractor) Blocks() ([]layout.Block, error)
- func (e *Extractor) ByColumn() *Extractor
- func (e *Extractor) Chunks() (*rag.ChunkCollection, []Warning, error)
- func (e *Extractor) ChunksWithConfig(config rag.ChunkerConfig, sizeConfig rag.SizeConfig) (*rag.ChunkCollection, []Warning, error)
- func (e *Extractor) Close() error
- func (e *Extractor) Document() (*model.Document, []Warning, error)
- func (e *Extractor) Elements() ([]layout.LayoutElement, error)
- func (e *Extractor) ExcludeFooters() *Extractor
- func (e *Extractor) ExcludeHeaders() *Extractor
- func (e *Extractor) ExcludeHeadersAndFooters() *Extractor
- func (e *Extractor) Fragments() ([]text.TextFragment, []Warning, error)
- func (e *Extractor) Headings() ([]layout.Heading, error)
- func (e *Extractor) IsCharacterLevel() (bool, error)
- func (e *Extractor) IsMultiColumn() (bool, error)
- func (e *Extractor) JoinParagraphs() *Extractor
- func (e *Extractor) Lines() ([]layout.Line, error)
- func (e *Extractor) Lists() ([]layout.List, error)
- func (e *Extractor) PageCount() (int, error)
- func (e *Extractor) PageRange(start, end int) *Extractor
- func (e *Extractor) Pages(pages ...int) *Extractor
- func (e *Extractor) Paragraphs() ([]layout.Paragraph, error)
- func (e *Extractor) PreserveLayout() *Extractor
- func (e *Extractor) ReadingOrder() (*layout.ReadingOrderResult, error)
- func (e *Extractor) Text() (string, []Warning, error)
- func (e *Extractor) ToMarkdown() (string, []Warning, error)
- func (e *Extractor) ToMarkdownWithOptions(opts rag.MarkdownOptions) (string, []Warning, error)
- type Warning
- type WarningCode
Examples ¶
- Package (ChunkFiltering)
- Package (ChunkMetadata)
- Package (ChunksAsMarkdown)
- Package (CollectionStatistics)
- Package (CustomChunkSizing)
- Package (ErrorHandling)
- Package (ExtractMarkdown)
- Package (ExtractText)
- Package (ExtractWithOptions)
- Package (InspectionMethods)
- Package (MarkdownOptions)
- Package (OpenDocuments)
- Package (RagChunking)
- Package (Warnings)
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func FormatWarnings ¶
FormatWarnings returns a human-readable string of all warnings. It returns an empty string if there are no warnings.
func Must ¶
Must is a helper that wraps a call to a function returning (T, error) and panics if the error is non-nil. It is intended for use in scripts or tests where error handling would be cumbersome.
Example:
count := tabula.Must(tabula.Open("document.pdf").PageCount())
func MustText ¶
MustText is a helper that wraps a call to Text() or Fragments() and panics if the error is non-nil. It discards warnings and returns just the value. It is intended for use in scripts or tests where error handling would be cumbersome.
Example:
text := tabula.MustText(tabula.Open("document.pdf").Text())
Types ¶
type ExtractOptions ¶
type ExtractOptions struct {
// contains filtered or unexported fields
}
ExtractOptions holds configuration for text extraction.
type Extractor ¶
type Extractor struct {
// contains filtered or unexported fields
}
Extractor provides a fluent interface for extracting content from PDF, DOCX, ODT, XLSX, PPTX, HTML, and EPUB files. Each configuration method returns a new Extractor instance, making it safe for concurrent use and allowing method chaining.
func FromHTMLReader ¶ added in v1.4.0
FromHTMLReader creates an Extractor from an io.Reader containing HTML content. This is useful when you have HTML content that was fetched from a remote source (e.g., via HTTP) and want to extract text or convert it to markdown without saving it to a file first.
Example:
resp, err := http.Get("https://example.com/page")
if err != nil {
// handle error
}
defer resp.Body.Close()
text, warnings, err := tabula.FromHTMLReader(resp.Body).Text()
func FromHTMLString ¶ added in v1.4.0
FromHTMLString creates an Extractor from a string containing HTML content. This is useful when you have HTML content as a string (e.g., fetched from a web API or embedded in your application) and want to extract text or convert it to markdown.
Example:
html := `<html><body><h1>Hello</h1><p>World</p></body></html>`
text, warnings, err := tabula.FromHTMLString(html).Text()
For web scraping:
resp, err := http.Get("https://example.com/page")
if err != nil {
// handle error
}
body, _ := io.ReadAll(resp.Body)
resp.Body.Close()
text, _, _ := tabula.FromHTMLString(string(body)).Text()
func FromReader ¶
FromReader creates an Extractor from an already-opened PDF reader.Reader. This is useful when you need more control over the PDF reader lifecycle. Note: The caller is responsible for closing the reader. For DOCX files, use Open() instead, which handles format detection automatically.
Example:
r, err := reader.Open("document.pdf")
if err != nil {
// handle error
}
defer r.Close()
text, warnings, err := tabula.FromReader(r).Text()
func Open ¶
Open opens a PDF or DOCX file and returns an Extractor for fluent configuration. The file format is automatically detected based on the file extension. The returned Extractor must be closed when done, either explicitly via Close() or implicitly when calling a terminal operation like Text().
Supported formats:
- PDF (.pdf)
- DOCX (.docx)
Example:
text, warnings, err := tabula.Open("document.pdf").Text()
text, warnings, err := tabula.Open("document.docx").Text()
func (*Extractor) Analyze ¶
func (e *Extractor) Analyze() (*layout.AnalysisResult, error)
Analyze performs complete layout analysis and returns all detected elements. This is the most comprehensive extraction method, combining columns, lines, paragraphs, headings, lists, and reading order into a unified result. This is a terminal operation that closes the underlying reader.
Example:
result, err := tabula.Open("document.pdf").Pages(1).Analyze()
for _, elem := range result.Elements {
fmt.Printf("[%s] %s\n", elem.Type, elem.Text)
}
func (*Extractor) Blocks ¶
Blocks extracts and returns detected text blocks from the document. Blocks are spatially grouped regions of text, useful for understanding document layout structure. This is a terminal operation that closes the underlying reader.
Example:
blocks, err := tabula.Open("document.pdf").Blocks()
for _, block := range blocks {
fmt.Printf("Block at (%.1f, %.1f): %s\n", block.BBox.X, block.BBox.Y, block.GetText())
}
func (*Extractor) ByColumn ¶
ByColumn configures the extractor to process text column by column in reading order, rather than line by line across the full page width. This is useful for multi-column documents like newspapers or academic papers.
Example:
text, _, err := tabula.Open("newspaper.pdf").ByColumn().Text()
func (*Extractor) Chunks ¶
func (e *Extractor) Chunks() (*rag.ChunkCollection, []Warning, error)
Chunks extracts content and returns semantic chunks for RAG workflows. This method combines document extraction with RAG chunking in a single call. This is a terminal operation that closes the underlying reader.
Example:
chunks, warnings, err := tabula.Open("document.pdf").
ExcludeHeadersAndFooters().
Chunks()
if err != nil {
log.Fatal(err)
}
for _, chunk := range chunks.Chunks {
fmt.Printf("[%s] %s\n", chunk.Metadata.SectionTitle, chunk.Text[:50])
}
func (*Extractor) ChunksWithConfig ¶
func (e *Extractor) ChunksWithConfig(config rag.ChunkerConfig, sizeConfig rag.SizeConfig) (*rag.ChunkCollection, []Warning, error)
ChunksWithConfig extracts content and returns semantic chunks using custom configuration. This allows fine-tuning of chunk sizes, overlap, and other parameters. This is a terminal operation that closes the underlying reader.
Example:
config := rag.ChunkerConfig{
TargetChunkSize: 500,
MaxChunkSize: 1000,
OverlapSize: 50,
}
sizeConfig := rag.DefaultSizeConfig()
chunks, warnings, err := tabula.Open("document.pdf").
ExcludeHeadersAndFooters().
ChunksWithConfig(config, sizeConfig)
func (*Extractor) Close ¶
Close releases resources associated with the Extractor. It is safe to call Close multiple times.
func (*Extractor) Document ¶
Document extracts content and returns a model.Document structure suitable for RAG chunking and other document processing workflows. This is a terminal operation that closes the underlying reader.
Example:
doc, warnings, err := tabula.Open("document.pdf").
ExcludeHeadersAndFooters().
Document()
doc, warnings, err := tabula.Open("document.docx").Document()
if err != nil {
log.Fatal(err)
}
// Use doc for chunking or other processing
func (*Extractor) Elements ¶
func (e *Extractor) Elements() ([]layout.LayoutElement, error)
Elements extracts and returns all detected elements in reading order. Elements include paragraphs, headings, and lists, unified into a single ordered list. This is useful for document reconstruction or RAG workflows. This is a terminal operation that closes the underlying reader.
Example:
elements, err := tabula.Open("document.pdf").Elements()
for _, elem := range elements {
fmt.Printf("[%s] %s\n", elem.Type, elem.Text)
}
func (*Extractor) ExcludeFooters ¶
ExcludeFooters configures the extractor to exclude detected footers.
Example:
text, _, err := tabula.Open("doc.pdf").ExcludeFooters().Text()
func (*Extractor) ExcludeHeaders ¶
ExcludeHeaders configures the extractor to exclude detected headers.
Example:
text, _, err := tabula.Open("doc.pdf").ExcludeHeaders().Text()
func (*Extractor) ExcludeHeadersAndFooters ¶
ExcludeHeadersAndFooters configures the extractor to exclude both detected headers and footers. This is a convenience method equivalent to calling ExcludeHeaders().ExcludeFooters().
Example:
text, _, err := tabula.Open("doc.pdf").ExcludeHeadersAndFooters().Text()
func (*Extractor) Fragments ¶
func (e *Extractor) Fragments() ([]text.TextFragment, []Warning, error)
Fragments extracts and returns text fragments with position information. This is a terminal operation that closes the underlying reader.
Returns the fragments, any warnings encountered during processing, and an error if extraction failed.
Example:
fragments, warnings, err := tabula.Open("document.pdf").Pages(1).Fragments()
func (*Extractor) Headings ¶
Headings extracts and returns detected headings (H1-H6) from the document. This is a terminal operation that closes the underlying reader.
Example:
headings, err := tabula.Open("document.pdf").Headings()
for _, h := range headings {
fmt.Printf("[%s] %s\n", h.Level, h.Text)
}
func (*Extractor) IsCharacterLevel ¶
IsCharacterLevel checks if the first page of the PDF uses character-level text fragments (one character per fragment). This requires special handling for proper text extraction. Note: This reads page 1 to make the determination. The reader remains open.
Example:
ext := tabula.Open("document.pdf")
defer ext.Close()
isCharLevel, err := ext.IsCharacterLevel()
func (*Extractor) IsMultiColumn ¶
IsMultiColumn checks if the first page of the PDF appears to have a multi-column layout. Note: This reads page 1 to make the determination. The reader remains open.
Example:
ext := tabula.Open("newspaper.pdf")
defer ext.Close()
multiCol, err := ext.IsMultiColumn()
func (*Extractor) JoinParagraphs ¶
JoinParagraphs configures the extractor to join lines within paragraphs using spaces instead of newlines. This produces cleaner text output where paragraph breaks are preserved but soft line breaks within paragraphs are removed.
Example:
text, _, err := tabula.Open("doc.pdf").JoinParagraphs().Text()
text, _, err := tabula.Open("doc.pdf").ExcludeHeadersAndFooters().JoinParagraphs().Text()
func (*Extractor) Lines ¶
Lines extracts and returns detected text lines with position and alignment info. This is a terminal operation that closes the underlying reader.
Example:
lines, err := tabula.Open("document.pdf").Lines()
for _, line := range lines {
fmt.Printf("%s (align: %s)\n", line.Text, line.Alignment)
}
func (*Extractor) Lists ¶
Lists extracts and returns detected lists (bulleted, numbered, etc.) from the document. This is a terminal operation that closes the underlying reader.
Example:
lists, err := tabula.Open("document.pdf").Lists()
for _, list := range lists {
fmt.Printf("List type: %s, items: %d\n", list.Type, len(list.Items))
}
func (*Extractor) PageCount ¶
PageCount returns the total number of pages in the document. For DOCX files, this returns 1 (the entire document is treated as a single page). Note: This does NOT close the reader, allowing further operations.
Example:
ext := tabula.Open("document.pdf")
defer ext.Close()
count, err := ext.PageCount()
func (*Extractor) PageRange ¶
PageRange specifies a range of pages to extract (1-indexed, inclusive).
Example:
text, _, err := tabula.Open("doc.pdf").PageRange(5, 10).Text()
func (*Extractor) Pages ¶
Pages specifies which pages to extract from (1-indexed). Multiple calls are cumulative.
Example:
text, _, err := tabula.Open("doc.pdf").Pages(1, 3, 5).Text()
func (*Extractor) Paragraphs ¶
Paragraphs extracts and returns detected paragraphs with style information. This uses reading order detection to handle multi-column layouts correctly. This is a terminal operation that closes the underlying reader.
Example:
paragraphs, err := tabula.Open("document.pdf").
ExcludeHeaders().
ExcludeFooters().
Paragraphs()
for _, para := range paragraphs {
fmt.Printf("[%s] %s\n", para.Style, para.Text)
}
func (*Extractor) PreserveLayout ¶
PreserveLayout maintains spatial positioning by inserting spaces to approximate the visual layout of the original document.
Example:
text, _, err := tabula.Open("form.pdf").PreserveLayout().Text()
func (*Extractor) ReadingOrder ¶
func (e *Extractor) ReadingOrder() (*layout.ReadingOrderResult, error)
ReadingOrder extracts and returns detailed reading order analysis. This includes column detection, section boundaries, and proper text ordering for multi-column documents. This is a terminal operation that closes the underlying reader.
Example:
ro, err := tabula.Open("newspaper.pdf").Pages(1).ReadingOrder()
fmt.Printf("Columns: %d\n", ro.ColumnCount)
for _, section := range ro.Sections {
fmt.Printf("Section: %s\n", section.Type)
}
func (*Extractor) Text ¶
Text extracts and returns the text content from the configured pages. This is a terminal operation that closes the underlying reader.
Returns the extracted text, any warnings encountered during processing, and an error if extraction failed. Warnings indicate non-fatal issues (e.g., messy PDF detected) where extraction succeeded but results may be imperfect.
Example:
text, warnings, err := tabula.Open("document.pdf").Text()
text, warnings, err := tabula.Open("document.docx").Text()
if len(warnings) > 0 {
log.Println("Warnings:", tabula.FormatWarnings(warnings))
}
func (*Extractor) ToMarkdown ¶
ToMarkdown extracts content and returns it as a markdown-formatted string. This preserves document structure including headings, paragraphs, and lists. This is a terminal operation that closes the underlying reader.
Returns the markdown text, any warnings encountered during processing, and an error if extraction failed.
Example:
md, warnings, err := tabula.Open("document.pdf").
ExcludeHeadersAndFooters().
ToMarkdown()
func (*Extractor) ToMarkdownWithOptions ¶
ToMarkdownWithOptions extracts content and returns it as markdown with custom options. This is a terminal operation that closes the underlying reader.
Supported options for all formats:
- IncludeMetadata: adds YAML front matter with document metadata
- IncludeTableOfContents: generates a table of contents from headings
- HeadingLevelOffset: adjusts heading levels (e.g., 1 makes H1 -> H2)
- MaxHeadingLevel: caps heading depth (default: 6)
PDF-only options (used via RAG chunking):
- IncludeChunkSeparators: adds horizontal rules between chunks
- IncludePageNumbers: adds page references
- IncludeChunkIDs: adds chunk IDs as HTML comments
Example:
opts := rag.MarkdownOptions{
IncludeTableOfContents: true,
IncludeMetadata: true,
}
md, warnings, err := tabula.Open("document.pdf").ToMarkdownWithOptions(opts)
md, warnings, err := tabula.Open("document.docx").ToMarkdownWithOptions(opts)
type Warning ¶
type Warning struct {
Code WarningCode
Message string
}
Warning represents a non-fatal issue encountered during PDF processing. Unlike errors, warnings indicate that extraction succeeded but the results may be imperfect or require attention.
type WarningCode ¶
type WarningCode int
WarningCode identifies the type of warning encountered during PDF processing.
const (
	// WarningMessyPDF indicates the PDF exhibits traits of being "messy" or
	// display-oriented (e.g., generated by Word, Quartz, or highly fragmented).
	// Text extraction may still succeed but results might have ordering issues.
	WarningMessyPDF WarningCode = iota
	// WarningOCRFallback indicates that OCR was used to extract text from
	// a scanned page that contained no native PDF text. This typically means
	// the page contains only images (e.g., a scanned document).
	WarningOCRFallback
)
Directories
¶
| Path | Synopsis |
|---|---|
| contentstream | Package contentstream provides parsing of PDF content streams. |
| core | Package core provides low-level PDF parsing primitives and object types. |
| docx | Package docx provides DOCX (Office Open XML) document parsing. |
| epubdoc | Package epubdoc provides EPUB document parsing. |
| font | Package font provides PDF font handling including Type1, TrueType, and CID fonts. |
| format | Package format provides file format detection for the tabula library. |
| graphicsstate | Package graphicsstate provides PDF graphics state management. |
| htmldoc | Package htmldoc provides HTML document parsing. |
| internal | |
| internal/filters | Package filters provides PDF stream decompression filters. |
| layout | Package layout provides document layout analysis for extracting semantic structure from PDF pages. |
| model | Package model provides the intermediate representation (IR) for extracted document content. |
| ocr | Package ocr provides OCR (Optical Character Recognition) capabilities for extracting text from images in scanned PDFs. |
| odt | Package odt provides ODT (OpenDocument Text) document parsing. |
| pages | Package pages provides PDF page tree traversal and page access. |
| pptx | Package pptx provides PPTX (Office Open XML Presentation) document parsing. |
| rag | Package rag provides semantic chunking for RAG (Retrieval-Augmented Generation) workflows. |
| reader | Package reader provides high-level PDF file reading and object resolution. |
| resolver | Package resolver provides PDF indirect reference resolution. |
| tables | Package tables provides table detection and extraction from PDF pages. |
| text | Package text provides text extraction from PDF content streams. |
| xlsx | Package xlsx provides XLSX (Office Open XML Spreadsheet) document parsing. |