Files
YouduWiki/backend/store/rag/html2md.go
2026-05-21 19:52:45 +08:00

168 lines
5.1 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package rag
import (
"path"
"strings"
"github.com/JohannesKaufmann/dom"
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/table"
"golang.org/x/net/html"
)
// NewHTML2MDConverter returns an HTML-to-Markdown converter that has the base,
// CommonMark and table plugins enabled and this package's custom renderers
// (attachment spans, task lists, flowchart divs) registered.
func NewHTML2MDConverter() *converter.Converter {
	basePlugin := base.NewBasePlugin()
	commonmarkPlugin := commonmark.NewCommonmarkPlugin()
	// The table plugin is configured with SpanBehaviorMirror for spanned cells
	// and NewlineBehaviorPreserve for newlines.
	tablePlugin := table.NewTablePlugin(
		table.WithSpanCellBehavior(table.SpanBehaviorMirror),
		table.WithNewlineBehavior(table.NewlineBehaviorPreserve),
	)

	c := converter.NewConverter(
		converter.WithPlugins(basePlugin, commonmarkPlugin, tablePlugin),
	)

	// Custom renderers. Each one is registered with converter.PriorityEarly
	// (presumably so it is tried before the plugins' own renderers for the
	// same tag — confirm against the converter documentation).

	// <span data-tag="attachment"> -> Markdown link.
	c.Register.RendererFor("span", converter.TagTypeInline, renderAttachment, converter.PriorityEarly)

	// <ul data-type="taskList"> -> Markdown task list.
	c.Register.RendererFor("ul", converter.TagTypeBlock, renderTaskList, converter.PriorityEarly)

	// <div data-type="flow"> -> fenced mermaid code block.
	c.Register.RendererFor("div", converter.TagTypeBlock, renderFlowchart, converter.PriorityEarly)

	return c
}
// renderAttachment renders a <span data-tag="attachment"> as an inline
// Markdown link of the form [title](url).
//
// The link target comes from the data-url attribute, falling back to url. The
// link text comes from data-title, falling back to title, and finally to the
// last path element of the URL (with any ?query or #fragment removed).
// Attribute values are trimmed of surrounding whitespace before use, so a
// padded attribute no longer produces a malformed link.
//
// It returns RenderTryNext when the node is not an attachment span, when no
// non-blank URL is available, or when the write fails; otherwise it returns
// RenderSuccess.
func renderAttachment(ctx converter.Context, w converter.Writer, node *html.Node) converter.RenderStatus {
	if node.Type != html.ElementNode || node.Data != "span" {
		return converter.RenderTryNext
	}

	// Only spans marked data-tag="attachment" are handled here.
	if tag, ok := dom.GetAttribute(node, "data-tag"); !ok || tag != "attachment" {
		return converter.RenderTryNext
	}

	// firstAttr returns the trimmed value of the first listed attribute that
	// is present and not blank, or "" when none qualifies.
	firstAttr := func(names ...string) string {
		for _, name := range names {
			if v, ok := dom.GetAttribute(node, name); ok {
				if v = strings.TrimSpace(v); v != "" {
					return v
				}
			}
		}
		return ""
	}

	target := firstAttr("data-url", "url")
	if target == "" {
		// No usable link: leave the node to other renderers.
		return converter.RenderTryNext
	}

	title := firstAttr("data-title", "title")
	if title == "" {
		// Derive a title from the file name at the end of the URL path. The
		// query string and fragment are cut off first so that
		// "/files/a.pdf?token=x" yields "a.pdf" rather than "a.pdf?token=x".
		name := target
		if i := strings.IndexAny(name, "?#"); i >= 0 {
			name = name[:i]
		}
		title = path.Base(name)
		if title == "." || title == "/" {
			// path.Base found no file name (empty or slash-only path); the
			// URL itself is the most informative text left.
			title = target
		}
	}

	// Inline output: no surrounding newlines.
	if _, err := w.WriteString("[" + title + "](" + target + ")"); err != nil {
		return converter.RenderTryNext
	}
	return converter.RenderSuccess
}
// renderTaskList renders a <ul data-type="taskList"> as a Markdown task list.
//
// Every direct <li> child becomes one line. A child with
// data-type="taskItem" is written as "- [x] text" when data-checked="true"
// and as "- [ ] text" otherwise. Any other <li> is written as a plain
// "- text" bullet so that its content is not lost. The text of each item is
// obtained from getTextFromTaskItem.
//
// The whole list is assembled first and written with a single call, preceded
// and followed by a blank line so it is separated from adjacent content. This
// way a write error cannot leave half a list in the output before the node is
// handed to the next renderer.
//
// It returns RenderTryNext when the node is not a task list, when it has no
// <li> children (so nothing is silently dropped), or when the write fails;
// otherwise it returns RenderSuccess.
func renderTaskList(ctx converter.Context, w converter.Writer, node *html.Node) converter.RenderStatus {
	if listType, ok := dom.GetAttribute(node, "data-type"); !ok || listType != "taskList" {
		return converter.RenderTryNext
	}

	var out strings.Builder
	items := 0
	for li := node.FirstChild; li != nil; li = li.NextSibling {
		if li.Type != html.ElementNode || li.Data != "li" {
			continue
		}

		marker := "- "
		if itemType, ok := dom.GetAttribute(li, "data-type"); ok && itemType == "taskItem" {
			marker = "- [ ] "
			// A missing data-checked attribute reads as "" and therefore as
			// unchecked.
			if checked, _ := dom.GetAttribute(li, "data-checked"); checked == "true" {
				marker = "- [x] "
			}
		}

		out.WriteString(marker)
		out.WriteString(getTextFromTaskItem(li))
		out.WriteByte('\n')
		items++
	}

	if items == 0 {
		// Nothing to render here; let another renderer deal with the node
		// instead of reporting success with empty output.
		return converter.RenderTryNext
	}

	if _, err := w.WriteString("\n\n" + out.String() + "\n"); err != nil {
		return converter.RenderTryNext
	}
	return converter.RenderSuccess
}
// getTextFromTaskItem returns the text content of node and all of its
// descendants as a single line.
//
// Text nodes are concatenated in document order. A space is inserted around
// block-level elements and <br> so that text from adjacent blocks (for
// example two <p> children) does not run together, while text split across
// inline elements stays joined. Every run of whitespace, including newlines,
// is then collapsed to one space and the result is trimmed, because callers
// emit the text as a one-line list item.
//
// A nil node yields "".
func getTextFromTaskItem(node *html.Node) string {
	if node == nil {
		return ""
	}

	// separates reports whether an element with this tag name should act as
	// a word boundary for the text around it.
	separates := func(tag string) bool {
		switch tag {
		case "br", "p", "div", "li", "ul", "ol", "blockquote", "pre",
			"h1", "h2", "h3", "h4", "h5", "h6":
			return true
		}
		return false
	}

	var sb strings.Builder
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.TextNode {
			sb.WriteString(n.Data)
		}
		boundary := n.Type == html.ElementNode && separates(n.Data)
		if boundary {
			sb.WriteByte(' ')
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
		if boundary {
			sb.WriteByte(' ')
		}
	}
	walk(node)

	// Fields splits on any whitespace run and drops leading/trailing
	// whitespace, so joining with one space both collapses and trims.
	return strings.Join(strings.Fields(sb.String()), " ")
}
// renderFlowchart renders a <div data-type="flow"> as a fenced mermaid code
// block whose body is taken from the div's data-code attribute.
//
// The attribute value is passed through html.UnescapeString and every literal
// backslash-n sequence in it is replaced by a real newline before it is
// written. It returns RenderTryNext when the node is not a flow div, when
// data-code is missing or blank, or when a write fails; otherwise it returns
// RenderSuccess.
func renderFlowchart(ctx converter.Context, w converter.Writer, node *html.Node) converter.RenderStatus {
	isDiv := node.Type == html.ElementNode && node.Data == "div"
	if !isDiv {
		return converter.RenderTryNext
	}

	// Only divs marked data-type="flow" are handled here.
	if kind, found := dom.GetAttribute(node, "data-type"); !(found && kind == "flow") {
		return converter.RenderTryNext
	}

	raw, found := dom.GetAttribute(node, "data-code")
	if !found || strings.TrimSpace(raw) == "" {
		return converter.RenderTryNext
	}

	// Decode HTML entities, then turn escaped "\n" sequences into newlines.
	diagram := strings.ReplaceAll(html.UnescapeString(raw), `\n`, "\n")

	// Opening fence, diagram source, closing fence — written in that order,
	// stopping at the first write error.
	for _, chunk := range [...]string{"\n```mermaid\n", diagram, "\n```\n\n"} {
		if _, err := w.WriteString(chunk); err != nil {
			return converter.RenderTryNext
		}
	}
	return converter.RenderSuccess
}