Documentation
¶
Index ¶
- Constants
- Variables
- func ExtractText(htmlBytes []byte) (string, error)
- func ExtractToJSON(htmlBytes []byte) ([]byte, error)
- func ExtractToMarkdown(htmlBytes []byte) (string, error)
- func GroupLinksByType(links []LinkResource) map[string][]LinkResource
- type Attribute
- type AudioInfo
- type Config
- type ExtractConfig
- type ImageInfo
- type LinkExtractionConfig
- type LinkInfo
- type LinkResource
- type Node
- type NodeType
- type ParseOption
- type Processor
- func (p *Processor) ClearCache()
- func (p *Processor) Close() error
- func (p *Processor) Extract(htmlBytes []byte, configs ...ExtractConfig) (*Result, error)
- func (p *Processor) ExtractAllLinks(htmlBytes []byte, configs ...LinkExtractionConfig) ([]LinkResource, error)
- func (p *Processor) ExtractBatch(htmlContents [][]byte, configs ...ExtractConfig) ([]*Result, error)
- func (p *Processor) ExtractBatchFiles(filePaths []string, configs ...ExtractConfig) ([]*Result, error)
- func (p *Processor) ExtractFromFile(filePath string, configs ...ExtractConfig) (*Result, error)
- func (p *Processor) GetStatistics() Statistics
- func (p *Processor) ResetStatistics()
- type Result
- type Statistics
- type Token
- type Tokenizer
- type VideoInfo
Constants ¶
const ( ErrorNode = stdxhtml.ErrorNode TextNode = stdxhtml.TextNode DocumentNode = stdxhtml.DocumentNode ElementNode = stdxhtml.ElementNode CommentNode = stdxhtml.CommentNode DoctypeNode = stdxhtml.DoctypeNode RawNode = stdxhtml.RawNode )
const ( ErrorToken = stdxhtml.ErrorToken TextToken = stdxhtml.TextToken StartTagToken = stdxhtml.StartTagToken EndTagToken = stdxhtml.EndTagToken SelfClosingTagToken = stdxhtml.SelfClosingTagToken CommentToken = stdxhtml.CommentToken DoctypeToken = stdxhtml.DoctypeToken )
const ( DefaultMaxInputSize = 50 * 1024 * 1024 DefaultMaxCacheEntries = 2000 DefaultWorkerPoolSize = 4 DefaultCacheTTL = time.Hour DefaultMaxDepth = 500 DefaultProcessingTimeout = 30 * time.Second )
Variables ¶
var (
	// ErrInputTooLarge is returned when input exceeds MaxInputSize.
	ErrInputTooLarge = errors.New("html: input size exceeds maximum")
	// ErrInvalidHTML is returned when HTML parsing fails.
	ErrInvalidHTML = errors.New("html: invalid HTML")
	// ErrProcessorClosed is returned when operations are attempted on a closed processor.
	ErrProcessorClosed = errors.New("html: processor closed")
	// ErrMaxDepthExceeded is returned when HTML nesting exceeds MaxDepth.
	ErrMaxDepthExceeded = errors.New("html: max depth exceeded")
	// ErrInvalidConfig is returned when configuration validation fails.
	ErrInvalidConfig = errors.New("html: invalid config")
	// ErrProcessingTimeout is returned when processing exceeds ProcessingTimeout.
	ErrProcessingTimeout = errors.New("html: processing timeout exceeded")
	// ErrFileNotFound is returned when specified file cannot be read.
	ErrFileNotFound = errors.New("html: file not found")
	// ErrInvalidFilePath is returned when file path validation fails.
	ErrInvalidFilePath = errors.New("html: invalid file path")
)
Error definitions for the `cybergodev/html` package.
var ( ErrBufferExceeded = stdxhtml.ErrBufferExceeded Parse = stdxhtml.Parse ParseFragment = stdxhtml.ParseFragment Render = stdxhtml.Render EscapeString = htmlstd.EscapeString UnescapeString = htmlstd.UnescapeString NewTokenizer = stdxhtml.NewTokenizer NewTokenizerFragment = stdxhtml.NewTokenizerFragment )
Functions ¶
func ExtractText ¶ added in v1.0.2
ExtractText extracts plain text from HTML bytes with automatic encoding detection. The function automatically detects the character encoding and converts the input to UTF-8.
Parameters:
htmlBytes - Raw HTML bytes (auto-detects encoding)
Returns:
string - Extracted plain text in UTF-8
error - Error if extraction fails
Example:
bytes, _ := os.ReadFile("document.html")
text, _ := html.ExtractText(bytes)
func ExtractToJSON ¶ added in v1.0.4
func ExtractToMarkdown ¶ added in v1.0.4
ExtractToMarkdown converts HTML bytes to Markdown with automatic encoding detection. The function automatically detects the character encoding (Windows-1252, UTF-8, GBK, Shift_JIS, etc.) from the HTML bytes and converts the content to UTF-8 before processing.
Parameters:
htmlBytes - Raw HTML bytes (auto-detects encoding)
Returns:
string - Markdown content in UTF-8
error - Error if conversion fails
Example:
// HTTP response
resp, _ := http.Get(url)
bytes, _ := io.ReadAll(resp.Body)
markdown, _ := html.ExtractToMarkdown(bytes)
// File
bytes, _ := os.ReadFile("document.html")
markdown, _ := html.ExtractToMarkdown(bytes)
func GroupLinksByType ¶ added in v1.0.2
func GroupLinksByType(links []LinkResource) map[string][]LinkResource
Types ¶
type Config ¶
type Config struct {
MaxInputSize int
MaxCacheEntries int
CacheTTL time.Duration
WorkerPoolSize int
EnableSanitization bool
MaxDepth int
ProcessingTimeout time.Duration
}
func DefaultConfig ¶
func DefaultConfig() Config
type ExtractConfig ¶
type ExtractConfig struct {
ExtractArticle bool
PreserveImages bool
PreserveLinks bool
PreserveVideos bool
PreserveAudios bool
InlineImageFormat string
TableFormat string
// Encoding specifies the character encoding of the input HTML.
// If empty, the encoding will be auto-detected from meta tags or BOM.
// Common values: "utf-8", "windows-1252", "iso-8859-1", "shift_jis", etc.
Encoding string
}
func DefaultExtractConfig ¶
func DefaultExtractConfig() ExtractConfig
type LinkExtractionConfig ¶ added in v1.0.2
type LinkExtractionConfig struct {
ResolveRelativeURLs bool
BaseURL string
IncludeImages bool
IncludeVideos bool
IncludeAudios bool
IncludeCSS bool
IncludeJS bool
IncludeContentLinks bool
IncludeExternalLinks bool
IncludeIcons bool
}
func DefaultLinkExtractionConfig ¶ added in v1.0.2
func DefaultLinkExtractionConfig() LinkExtractionConfig
type LinkResource ¶ added in v1.0.2
func ExtractAllLinks ¶ added in v1.0.2
func ExtractAllLinks(htmlBytes []byte, configs ...LinkExtractionConfig) ([]LinkResource, error)
ExtractAllLinks extracts all links from HTML bytes with automatic encoding detection. The function automatically detects the character encoding and converts the input to UTF-8.
Parameters:
htmlBytes - Raw HTML bytes (auto-detects encoding)
configs - Optional link extraction configurations
Returns:
[]LinkResource - List of extracted links with UTF-8 encoded titles
error - Error if extraction fails
Example:
bytes, _ := os.ReadFile("document.html")
links, _ := html.ExtractAllLinks(bytes)
type ParseOption ¶ added in v1.1.0
type ParseOption = stdxhtml.ParseOption
Type aliases for commonly used types from golang.org/x/net/html.
type Processor ¶
type Processor struct {
// contains filtered or unexported fields
}
func New ¶
New creates a new HTML processor with the given configuration. If no configuration is provided, it uses DefaultConfig().
The function signature uses variadic arguments to make the config optional:
processor, err := html.New()       // Uses DefaultConfig()
processor, err := html.New(config) // Uses custom config
The returned processor must be closed when no longer needed:
processor, err := html.New()
defer processor.Close()
func (*Processor) ClearCache ¶
func (p *Processor) ClearCache()
ClearCache clears the cache contents but preserves cumulative statistics. Use ResetStatistics to reset statistics counters.
func (*Processor) Extract ¶
func (p *Processor) Extract(htmlBytes []byte, configs ...ExtractConfig) (*Result, error)
Extract extracts content from HTML bytes with automatic encoding detection. This is the main extraction method that processes HTML bytes after detecting and converting their character encoding to UTF-8.
The method performs the following steps:
1. Validates processor state (not closed)
2. Resolves extraction configuration
3. Checks input size limits
4. Detects character encoding and converts to UTF-8
5. Processes content with caching support
6. Updates statistics and returns result
func (*Processor) ExtractAllLinks ¶ added in v1.0.2
func (p *Processor) ExtractAllLinks(htmlBytes []byte, configs ...LinkExtractionConfig) ([]LinkResource, error)
ExtractAllLinks extracts all links from HTML bytes with automatic encoding detection. The method automatically detects character encoding and converts to UTF-8 before extracting links, ensuring that link titles and text are properly decoded.
func (*Processor) ExtractBatch ¶
func (p *Processor) ExtractBatch(htmlContents [][]byte, configs ...ExtractConfig) ([]*Result, error)
func (*Processor) ExtractBatchFiles ¶
func (p *Processor) ExtractBatchFiles(filePaths []string, configs ...ExtractConfig) ([]*Result, error)
func (*Processor) ExtractFromFile ¶
func (p *Processor) ExtractFromFile(filePath string, configs ...ExtractConfig) (*Result, error)
func (*Processor) GetStatistics ¶
func (p *Processor) GetStatistics() Statistics
func (*Processor) ResetStatistics ¶ added in v1.2.0
func (p *Processor) ResetStatistics()
ResetStatistics resets all statistics counters to zero. This preserves cache entries while clearing the accumulated metrics.
type Result ¶
type Result struct {
Text string `json:"text"`
Title string `json:"title"`
Images []ImageInfo `json:"images,omitempty"`
Links []LinkInfo `json:"links,omitempty"`
Videos []VideoInfo `json:"videos,omitempty"`
Audios []AudioInfo `json:"audios,omitempty"`
ProcessingTime time.Duration `json:"processing_time_ms"`
WordCount int `json:"word_count"`
ReadingTime time.Duration `json:"reading_time_ms"`
}
func Extract ¶ added in v1.0.2
func Extract(htmlBytes []byte, configs ...ExtractConfig) (*Result, error)
Extract extracts content from HTML bytes with automatic encoding detection. The function automatically detects the character encoding (Windows-1252, UTF-8, GBK, Shift_JIS, etc.) from the HTML bytes and converts the content to UTF-8 before processing.
This is the primary function for HTML content extraction when the source encoding may not be UTF-8, such as content from HTTP responses, databases, or files.
Parameters:
htmlBytes - Raw HTML bytes (auto-detects encoding)
configs - Optional extraction configurations
Returns:
*Result - Extracted content with UTF-8 encoded text
error - Error if extraction fails
Example:
// HTTP response
resp, _ := http.Get(url)
bytes, _ := io.ReadAll(resp.Body)
result, _ := html.Extract(bytes)
// File
bytes, _ := os.ReadFile("document.html")
result, _ := html.Extract(bytes)
func ExtractFromFile ¶ added in v1.0.2
func ExtractFromFile(filePath string, configs ...ExtractConfig) (*Result, error)
ExtractFromFile extracts content from an HTML file with automatic encoding detection. Use this when you have a file path instead of raw bytes.
Parameters:
filePath - Path to the HTML file
configs - Optional extraction configurations
Returns:
*Result - Extracted content with UTF-8 encoded text
error - Error if file reading or extraction fails
Example:
result, _ := html.ExtractFromFile("document.html", html.ExtractConfig{
InlineImageFormat: "markdown",
})