Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ExtractImagePaths ¶
func StripURLImages ¶
Types ¶
type FileTransformer ¶
var TransformHtmlToMd FileTransformer = func(file File) (File, error) { if file.Type != TypeHTML { return File{}, fmt.Errorf("expected html, got %q", file.Type) } if file.TextContent == "" { return File{}, fmt.Errorf("html file has no text content") } data := []byte(file.TextContent) article, readErr := readability.FromReader(bytes.NewReader(data), nil) var htmlContent string var title string if readErr == nil && article.Content != "" { htmlContent = article.Content title = article.Title } else { htmlContent = file.TextContent } conv := converter.NewConverter( converter.WithPlugins( base.NewBasePlugin(), commonmark.NewCommonmarkPlugin(), ), ) conv.Register.PreRenderer(promoteCodeLangAttrs, 100) md, err := conv.ConvertString(htmlContent) if err != nil { return File{}, fmt.Errorf("convert html: %w", err) } md = stripFileScheme(md) if title != "" { md = "# " + title + "\n\n" + md } images := ExtractImagePaths(md) return File{Type: TypeMD, BasenameStripped: file.BasenameStripped, Dir: file.Dir, TextContent: md, ImagePaths: images}, nil }
var TransformHtmlToPdf FileTransformer = func(file File) (File, error) { if file.Type != TypeHTML { return File{}, fmt.Errorf("expected html, got %q", file.Type) } if file.TextContent == "" { return File{}, fmt.Errorf("html file has no text content") } u := launcher.New().Headless(true).Logger(io.Discard).MustLaunch() browser := rod.New().ControlURL(u).Logger(utils.LoggerQuiet).MustConnect() defer browser.MustClose() page := browser.MustPage("") page.MustSetDocumentContent(file.TextContent) page.MustWaitStable() pdf, err := page.PDF(&proto.PagePrintToPDF{ PrintBackground: true, MarginTop: floatPtr(0.4), MarginBottom: floatPtr(0.4), MarginLeft: floatPtr(0.4), MarginRight: floatPtr(0.4), }) if err != nil { return File{}, fmt.Errorf("print to pdf: %w", err) } buf, err := io.ReadAll(pdf) if err != nil { return File{}, fmt.Errorf("read pdf: %w", err) } return File{ Type: TypePDF, BasenameStripped: file.BasenameStripped, BinaryContent: buf, ImagePaths: file.ImagePaths, Dir: file.Dir, }, nil }
var TransformMdToEpub FileTransformer = func(file File) (File, error) { if file.Type != TypeMD { return File{}, fmt.Errorf("expected md, got %q", file.Type) } if file.TextContent == "" { return File{}, fmt.Errorf("md file has no text content") } md := file.TextContent images := ExtractImagePaths(md) title := file.BasenameStripped chs := splitChapters(md) if len(chs) > 0 && chs[0].title != "Introduction" { title = chs[0].title } e, err := epub.NewEpub(title) if err != nil { return File{}, fmt.Errorf("create epub: %w", err) } cssTmp := filepath.Join(os.TempDir(), "tt-highlight.css") if err := os.WriteFile(cssTmp, []byte(highlightCSS), 0644); err != nil { return File{}, fmt.Errorf("write highlight css: %w", err) } defer os.Remove(cssTmp) cssPath, err := e.AddCSS(cssTmp, "highlight.css") if err != nil { return File{}, fmt.Errorf("add css: %w", err) } rewrite := map[string]string{} for _, imgURL := range images { fsPath, _ := url.PathUnescape(imgURL) if !filepath.IsAbs(fsPath) && file.Dir != "" { fsPath = filepath.Join(file.Dir, fsPath) } internalPath, err := e.AddImage(fsPath, "") if err != nil { continue } rewrite[imgURL] = internalPath } md = RewriteImagePaths(md, rewrite) chs = splitChapters(md) for _, ch := range chs { body := mdToHTML(ch.body) if _, err := e.AddSection(body, ch.title, "", cssPath); err != nil { return File{}, fmt.Errorf("add epub section: %w", err) } } tmpPath := filepath.Join(os.TempDir(), file.BasenameStripped+".epub") if err := e.Write(tmpPath); err != nil { return File{}, fmt.Errorf("write epub: %w", err) } data, err := os.ReadFile(tmpPath) if err != nil { return File{}, fmt.Errorf("read epub: %w", err) } os.Remove(tmpPath) return File{Type: TypeEPUB, BasenameStripped: file.BasenameStripped, BinaryContent: data}, nil }
var TransformMdToHtml FileTransformer = func(file File) (File, error) { if file.Type != TypeMD { return File{}, fmt.Errorf("expected md, got %q", file.Type) } if file.TextContent == "" { return File{}, fmt.Errorf("md file has no text content") } var body bytes.Buffer if err := renderMarkdown(file.TextContent, &body); err != nil { return File{}, fmt.Errorf("render markdown: %w", err) } out := "<!doctype html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">\n" + "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n" + "<style>\n" + githubMarkdownCSS + "\n</style>\n" + "</head>\n" + "<body class=\"markdown-body\" style=\"max-width:980px;margin:0 auto;padding:32px\">\n" + body.String() + "</body>\n</html>\n" return File{ Type: TypeHTML, BasenameStripped: file.BasenameStripped, TextContent: out, ImagePaths: file.ImagePaths, Dir: file.Dir, }, nil }
var TransformMdToMd FileTransformer = func(file File) (File, error) { if file.Type != TypeMD { return File{}, fmt.Errorf("expected md, got %q", file.Type) } if file.TextContent == "" { return File{}, fmt.Errorf("md file has no text content") } images := ExtractImagePaths(file.TextContent) return File{Type: TypeMD, BasenameStripped: file.BasenameStripped, Dir: file.Dir, TextContent: file.TextContent, ImagePaths: images}, nil }
var TransformPdfToMd FileTransformer = func(file File) (File, error) { if file.Type != TypePDF { return File{}, fmt.Errorf("expected pdf, got %q", file.Type) } if len(file.BinaryContent) == 0 { return File{}, fmt.Errorf("pdf file has no binary content") } doc, err := fitz.NewFromMemory(file.BinaryContent) if err != nil { return File{}, fmt.Errorf("open pdf: %w", err) } defer doc.Close() tmpDir := file.TempDirPath imgIdx := 0 var raw []annotatedLine numPages := doc.NumPage() for i := 0; i < numPages; i++ { pageHTML, err := doc.HTML(i, false) if err != nil { return File{}, fmt.Errorf("extract pdf page: %w", err) } pageLines, pageHeight := extractLines(pageHTML, tmpDir, &imgIdx) for _, l := range pageLines { raw = append(raw, annotatedLine{line: l, page: i, pageHeight: pageHeight}) } if OnProgress != nil { OnProgress(i+1, numPages) } } if len(raw) == 0 { return File{Type: TypeMD, BasenameStripped: file.BasenameStripped, TextContent: ""}, nil } bodySize := detectBodyFontSize(raw) var blocks []string var current []string var prev annotatedLine flush := func() { if len(current) > 0 { blocks = append(blocks, strings.Join(current, " ")) current = nil } } for i, al := range raw { line := al.line heading := headingLevel(line.fontSize, bodySize) if heading > 0 { if pdfHackLabelRe.MatchString(line.text) { flush() current = append(current, `\`+line.text) prev = al continue } flush() blocks = append(blocks, fmt.Sprintf("%s %s", strings.Repeat("#", heading), line.text)) prev = al continue } if i == 0 { current = append(current, line.text) prev = al continue } sameFontSize := math.Abs(line.fontSize-prev.line.fontSize) < pdfFontSizeTolerance crossPage := al.page != prev.page var sameParagraph bool if crossPage { sameParagraph = sameFontSize && math.Abs(line.left-prev.line.left) < pdfLeftMarginTolerance } else { gap := line.top - prev.line.top lineHeight := math.Max(prev.line.fontSize*pdfLineHeightMultiplier, pdfMinLineHeight) sameParagraph = gap > 0 && gap < lineHeight && sameFontSize } if sameParagraph { current = joinLines(current, line.text) } else { flush() current = append(current, line.text) } prev = al } flush() if meta := doc.Metadata(); meta["title"] != "" { title := meta["title"] norm := strings.ToLower(strings.TrimSpace(title)) var deduped []string for _, b := range blocks { if strings.HasPrefix(b, "# ") { h1 := strings.ToLower(strings.TrimSpace(strings.TrimPrefix(b, "# "))) if h1 == norm || strings.Contains(h1, norm) || strings.Contains(norm, h1) { continue } } deduped = append(deduped, b) } blocks = append([]string{"# " + title}, deduped...) } md := strings.Join(blocks, "\n\n") md = strings.ReplaceAll(md, "\x00", "") for _, pair := range [][2]string{{"\ufb00", "ff"}, {"\ufb01", "fi"}, {"\ufb02", "fl"}, {"\ufb03", "ffi"}, {"\ufb04", "ffl"}} { md = strings.ReplaceAll(md, pair[0], pair[1]) } images := ExtractImagePaths(md) return File{Type: TypeMD, BasenameStripped: file.BasenameStripped, Dir: file.Dir, TextContent: md, ImagePaths: images}, nil }
var TransformTxtToMd FileTransformer = func(file File) (File, error) { if file.Type != TypeTXT { return File{}, fmt.Errorf("expected txt, got %q", file.Type) } if file.TextContent == "" { return File{}, fmt.Errorf("txt file has no text content") } images := ExtractImagePaths(file.TextContent) return File{Type: TypeMD, BasenameStripped: file.BasenameStripped, Dir: file.Dir, TextContent: file.TextContent, ImagePaths: images}, nil }
Click to show internal directories.
Click to hide internal directories.