init push

2026-05-21 19:52:45 +08:00
commit e3f75311ab
1280 changed files with 179173 additions and 0 deletions
--- a/backend/utils/DFA.go
+++ b/backend/utils/DFA.go
@@ -0,0 +1,182 @@
+package utils
+
+import (
+	"errors"
+	"sync"
+)
+
+// Package-level registry of the DFAs built by InitDFA.
+//   - dfaInstance maps a kbID to its DFAInstance; it is nil until InitDFA first runs.
+//   - mu is the lock GetDFA (read) and InitDFA (write) take around dfaInstance.
+var (
+	dfaInstance map[string]*DFAInstance
+	mu          sync.RWMutex
+)
+
+// DFAInstance bundles the DFA built for one knowledge base with BuffSize, the
+// length in runes of the longest word it was built from (0 when InitDFA was
+// given no words).
+type DFAInstance struct {
+	DFA      *DFA
+	BuffSize int
+}
+
+// GetDFA looks up the DFA instance registered for kbID.
+// It returns nil when InitDFA has not stored an instance under that ID.
+func GetDFA(kbID string) *DFAInstance {
+	mu.RLock()
+	instance := dfaInstance[kbID]
+	mu.RUnlock()
+	return instance
+}
+
+// InitDFA Initialize a new DFA. --> this func used by pro
+func InitDFA(kbID string, words []string) {
+	mu.Lock()
+	defer mu.Unlock()
+	newDFA := &DFA{
+		Root: NewTrieNode(),
+	}
+	var BuffSize int // 默认为0
+	for _, word := range words {
+		newDFA.AddWord(word)
+		if BuffSize < len([]rune(word)) {
+			BuffSize = len([]rune(word))
+		}
+	}
+	if dfaInstance == nil {
+		dfaInstance = make(map[string]*DFAInstance)
+	}
+	dfaInstance[kbID] = &DFAInstance{
+		DFA:      newDFA,
+		BuffSize: BuffSize,
+	}
+}
+
+// TrieNode is one state of the DFA. Children holds the outgoing transitions
+// keyed by the next rune; IsEnd is true when the path from the root to this
+// node spells a complete word.
+type TrieNode struct {
+	Children map[rune]*TrieNode
+	IsEnd    bool
+}
+
+// NewTrieNode returns an empty, non-terminal node with an allocated Children map.
+func NewTrieNode() *TrieNode {
+	return &TrieNode{
+		Children: make(map[rune]*TrieNode),
+	}
+}
+
+// DFA is a rune trie of sensitive words rooted at Root.
+// Its methods do no locking of their own; callers that mutate and read the
+// same DFA from several goroutines must synchronise externally.
+type DFA struct {
+	Root *TrieNode
+}
+
+// AddWord inserts word into the trie, creating nodes along the way as needed.
+func (d *DFA) AddWord(word string) {
+	node := d.Root
+	for _, char := range word {
+		next, exists := node.Children[char]
+		if !exists {
+			next = NewTrieNode()
+			node.Children[char] = next
+		}
+		node = next
+	}
+	node.IsEnd = true
+}
+
+// UpdateOldWord replaces oldWord with newWord: oldWord is deleted (if present)
+// and newWord is added.
+func (d *DFA) UpdateOldWord(oldWord, newWord string) {
+	d.DeleteWord(oldWord)
+	d.AddWord(newWord)
+}
+
+// DeleteWord removes word from the trie and prunes the nodes that no longer
+// lead to any word. It reports whether word was present and has been removed;
+// a missing word, or a string that is only a prefix of stored words, yields
+// false and leaves the trie untouched.
+func (d *DFA) DeleteWord(word string) bool {
+	runes := []rune(word)
+
+	// path[i] is the node reached after consuming runes[:i]; path[0] is the root.
+	path := make([]*TrieNode, 0, len(runes)+1)
+	node := d.Root
+	path = append(path, node)
+	for _, r := range runes {
+		next, exists := node.Children[r]
+		if !exists {
+			return false // the path does not exist, nothing to delete
+		}
+		node = next
+		path = append(path, node)
+	}
+	if !node.IsEnd {
+		return false // the path exists but does not end a stored word
+	}
+	node.IsEnd = false
+
+	// Walk back towards the root and unlink nodes that became dead ends.
+	// Stop at the first node that still ends a word or still has children.
+	for i := len(runes); i > 0; i-- {
+		current := path[i]
+		if current.IsEnd || len(current.Children) > 0 {
+			break
+		}
+		delete(path[i-1].Children, runes[i-1])
+	}
+	return true
+}
+
+// DeleteWordBatch removes every word in words.
+// The deletions run one after another: DeleteWord mutates the trie's maps,
+// which must not be written from several goroutines at once.
+func (d *DFA) DeleteWordBatch(words []string) {
+	for _, word := range words {
+		d.DeleteWord(word)
+	}
+}
+
+// Filter returns text with every matched sensitive word masked, one '🚫' per
+// rune. Matching runs on the buffer being masked, so a word that starts inside
+// an already-masked span is no longer recognised.
+func (d *DFA) Filter(text string) string {
+	result := []rune(text)
+	for i := range result { // every position is tried as the start of a word
+		node := d.Root
+		for j := i; j < len(result); j++ {
+			next, exists := node.Children[result[j]]
+			if !exists {
+				break
+			}
+			node = next
+			if node.IsEnd { // a word ends at j: mask result[i..j], then keep extending
+				for k := i; k <= j; k++ {
+					result[k] = '🚫'
+				}
+			}
+		}
+	}
+	return string(result)
+}
+
+// Check reports whether text contains a sensitive word. It returns nil when
+// none is found, otherwise an error naming the first match, scanning start
+// positions left to right and taking the shortest word at each start.
+func (d *DFA) Check(text string) error {
+	runes := []rune(text)
+	for start := range runes {
+		node := d.Root
+		for end := start; end < len(runes); end++ {
+			next, exists := node.Children[runes[end]]
+			if !exists {
+				break
+			}
+			node = next
+			if node.IsEnd {
+				return errors.New("包含敏感词: " + string(runes[start:end+1]))
+			}
+		}
+	}
+	return nil
+}
--- a/backend/utils/epub.go
+++ b/backend/utils/epub.go
@@ -0,0 +1,430 @@
+package utils
+
+import (
+	"archive/zip"
+	"bytes"
+	"context"
+	"encoding/xml"
+	"errors"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"path/filepath"
+	"strings"
+	"sync"
+
+	"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
+	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
+	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
+	"github.com/chaitin/panda-wiki/domain"
+	"github.com/chaitin/panda-wiki/log"
+	"github.com/chaitin/panda-wiki/store/s3"
+	"github.com/google/uuid"
+	"github.com/minio/minio-go/v7"
+	"golang.org/x/sync/semaphore"
+)
+
+// EpubConverter converts an EPUB archive into Markdown and uploads the
+// archive's resource files through minioClient.
+// mu is held by processFile while it writes to resources. The three maps are
+// filled during Convert and nothing in this file clears them.
+// NOTE(review): that suggests one instance per conversion — confirm with callers.
+type EpubConverter struct {
+	logger      *log.Logger
+	mu          sync.Mutex
+	minioClient *s3.MinioClient
+	// relative path -> oss path
+	resources map[string]string
+	// id -> relative path
+	resourcesIdMap map[string]Item
+	// relative path -> id
+	relativePath map[string]string
+}
+
+// NewEpubConverter returns an EpubConverter with empty lookup maps that logs
+// under the "epubConverter" module and stores objects through minio.
+func NewEpubConverter(logger *log.Logger, minio *s3.MinioClient) *EpubConverter {
+	conv := &EpubConverter{
+		resources:      map[string]string{},
+		resourcesIdMap: map[string]Item{},
+		relativePath:   map[string]string{},
+	}
+	conv.logger = logger.WithModule("epubConverter")
+	conv.minioClient = minio
+	return conv
+}
+
+// Convert turns an uploaded EPUB archive into one Markdown document.
+//
+// It checks the archive's mimetype entry, loads the OPF package document,
+// uploads the non-skippable entries via uploadFile, and converts every archive
+// entry to Markdown. The pieces are then concatenated: guide references first,
+// then a "# 目录" list built from the NCX, then the spine items in order, each
+// preceded by a <span> anchor. Finally exchangeUrl rewrites the image URLs.
+//
+// It returns the title from the package metadata, the Markdown bytes and the
+// first error encountered.
+func (e *EpubConverter) Convert(ctx context.Context, kbID string, data *multipart.FileHeader) (string, []byte, error) {
+	reader, err := data.Open()
+	if err != nil {
+		return "", nil, err
+	}
+	defer reader.Close()
+	zipReader, err := zip.NewReader(reader, data.Size)
+	if err != nil {
+		return "", nil, err
+	}
+	if err := valid(zipReader); err != nil {
+		return "", nil, err
+	}
+
+	// read ./path/to/content.opf
+	p, err := getOpf(zipReader)
+	if err != nil {
+		return "", nil, err
+	}
+
+	for _, item := range p.Manifest.Items {
+		e.resourcesIdMap[item.ID] = item
+		e.relativePath[item.Href] = item.ID
+	}
+
+	// resolve resource file
+	if err := e.uploadFile(ctx, kbID, zipReader); err != nil {
+		return "", nil, err
+	}
+
+	conv := converter.NewConverter(
+		converter.WithPlugins(
+			base.NewBasePlugin(),
+			commonmark.NewCommonmarkPlugin(
+				commonmark.WithStrongDelimiter("__"),
+			),
+		),
+	)
+	conv.Register.TagType("a", converter.TagTypeRemove, converter.PriorityStandard)
+
+	res := make(map[string]*bytes.Buffer, len(zipReader.File))
+	var toc []map[string]string
+	for _, zipfile := range zipReader.File {
+		// Each entry is read and closed inside the helper, so no file handle
+		// stays open across iterations.
+		raw, err := readEpubZipEntry(zipfile)
+		if err != nil {
+			return "", nil, err
+		}
+		if strings.ToLower(filepath.Ext(zipfile.Name)) == ".ncx" {
+			toc, err = ParseNCX(bytes.NewReader(raw))
+			if err != nil {
+				return "", nil, err
+			}
+		}
+		mdStr, err := conv.ConvertString(string(raw))
+		if err != nil {
+			return "", nil, err
+		}
+		name := clearFileName(zipfile.Name)
+		e.logger.Info("convert File", "file name", name)
+		res[name] = bytes.NewBufferString(mdStr)
+	}
+
+	// page sequence
+	result := bytes.NewBuffer(nil)
+	for _, href := range p.Guide.References {
+		if r, ok := res[clearFileName(href.Href)]; ok {
+			if _, err := io.Copy(result, r); err != nil {
+				return "", nil, err
+			}
+			result.WriteString("\n\n")
+		}
+	}
+	result.WriteString("# 目录\n\n")
+	for _, v := range toc {
+		fmt.Fprintf(result, "- [%s](#%s)\n", v["title"], v["playOrder"])
+	}
+	// NCX src -> playOrder, used as the anchor id of each spine item.
+	anchors := make(map[string]string, len(toc))
+	for _, v := range toc {
+		anchors[v["src"]] = v["playOrder"]
+	}
+	for _, itemRef := range p.Spine.ItemRefs {
+		href := e.resourcesIdMap[itemRef.IDRef].Href
+		name := clearFileName(href)
+		e.logger.Debug("add File", "file name", name)
+		r, ok := res[name]
+		if !ok {
+			continue
+		}
+		result.WriteString("<span id=" + anchors[href] + "></span>\n\n")
+		if _, err := io.Copy(result, r); err != nil {
+			return "", nil, err
+		}
+		result.WriteString("\n\n")
+	}
+	str, err := e.exchangeUrl(ctx, result.String())
+	return p.Metadata.Title, str, err
+}
+
+// readEpubZipEntry returns the full decompressed content of one archive entry
+// and closes the entry before returning.
+func readEpubZipEntry(f *zip.File) ([]byte, error) {
+	rc, err := f.Open()
+	if err != nil {
+		return nil, err
+	}
+	defer rc.Close()
+	return io.ReadAll(rc)
+}
+
+// clearFileName reduces a path or href to its last element and cuts off
+// everything from the first "#" onwards.
+func clearFileName(str string) string {
+	base := filepath.Base(str)
+	if cut := strings.Index(base, "#"); cut >= 0 {
+		return base[:cut]
+	}
+	return base
+}
+
+// uploadFile uploads every non-skippable archive entry through processFile,
+// running at most 10 uploads at a time.
+//
+// It always waits for the uploads it started before returning, so no worker is
+// still writing to e.resources once the caller continues. It returns the error
+// from acquiring the semaphore (e.g. ctx cancelled) if that failed, otherwise
+// the first upload error received, or nil.
+func (e *EpubConverter) uploadFile(ctx context.Context, kbID string, zipReader *zip.Reader) error {
+	var wg sync.WaitGroup
+	// Buffered for the worst case so a failing worker never blocks on send.
+	errCh := make(chan error, len(zipReader.File))
+	sem := semaphore.NewWeighted(10) // limits concurrency to 10
+
+	var acquireErr error
+	for _, f := range zipReader.File {
+		if isSkippableFile(f.Name) {
+			continue
+		}
+
+		if err := sem.Acquire(ctx, 1); err != nil {
+			acquireErr = err // stop scheduling, but still wait for running uploads
+			break
+		}
+
+		wg.Add(1)
+		go func(f *zip.File) {
+			defer wg.Done()
+			defer sem.Release(1)
+
+			if err := e.processFile(ctx, f, kbID); err != nil {
+				errCh <- err
+			}
+		}(f)
+	}
+
+	wg.Wait()
+	close(errCh)
+
+	if acquireErr != nil {
+		return acquireErr
+	}
+	return <-errCh // first error, or nil when the closed channel is empty
+}
+
+// processFile uploads one archive entry to the bucket under a fresh UUID-based
+// key "<kbID>/<uuid><ext>" and records "/<bucket>/<key>" in e.resources under
+// the entry's archive name. The mapping is recorded before the upload starts.
+// NOTE(review): the content type is looked up by f.Name in relativePath, whose
+// keys are manifest hrefs — confirm both use the same path base.
+func (e *EpubConverter) processFile(ctx context.Context, f *zip.File, kbID string) error {
+	src, openErr := f.Open()
+	if openErr != nil {
+		return fmt.Errorf("打开文件 %s 失败: %v", f.Name, openErr)
+	}
+	defer src.Close()
+
+	suffix := strings.ToLower(filepath.Ext(f.Name))
+	objectKey := fmt.Sprintf("%s/%s%s", kbID, uuid.New().String(), suffix)
+	publicPath := fmt.Sprintf("/%s/%s", domain.Bucket, objectKey)
+
+	e.mu.Lock()
+	e.resources[f.Name] = publicPath
+	e.mu.Unlock()
+
+	opts := minio.PutObjectOptions{
+		ContentType:  e.resourcesIdMap[e.relativePath[f.Name]].MediaType,
+		UserMetadata: map[string]string{"originalname": filepath.Base(f.Name)},
+	}
+	_, putErr := e.minioClient.PutObject(ctx, domain.Bucket, objectKey, src, f.FileInfo().Size(), opts)
+	return putErr
+}
+
+// isSkippableFile reports whether an archive entry is left out of the upload:
+// the container descriptor, the mimetype entry, and names ending in .html,
+// .css or .xml (compared case-sensitively).
+func isSkippableFile(name string) bool {
+	if name == "META-INF/container.xml" || name == "mimetype" {
+		return true
+	}
+	switch filepath.Ext(name) {
+	case ".html", ".css", ".xml": // other extensions can be added here
+		return true
+	}
+	return false
+}
+
+// exchangeUrl rewrites the image URLs in the Markdown content through
+// ExchangeMarkDownImageUrl: a URL found in e.resources is replaced by its
+// stored path, any other URL is kept as it is.
+func (e *EpubConverter) exchangeUrl(ctx context.Context, content string) ([]byte, error) {
+	// lookup maps an original URL to its uploaded location.
+	lookup := func(ctx context.Context, originUrl *string) (string, error) {
+		if originUrl == nil {
+			return "", fmt.Errorf("originUrl is nil")
+		}
+		newUrl, exists := e.resources[*originUrl]
+		if !exists {
+			// no mapping recorded: keep the original URL
+			return *originUrl, nil
+		}
+		return newUrl, nil
+	}
+
+	processed, err := ExchangeMarkDownImageUrl(ctx, []byte(content), lookup)
+	if err != nil {
+		return nil, fmt.Errorf("failed to exchange URLs: %w", err)
+	}
+	return []byte(processed), nil
+}
+
+// getFullPath returns the full-path attribute of the first
+// <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
+// element of META-INF/container.xml.
+//
+// It returns an error when the archive has no container.xml, when the file
+// cannot be opened or decoded, or when it holds no rootfile with a non-empty
+// full-path.
+func getFullPath(zipReader *zip.Reader) (string, error) {
+	// XML structs matching the content of container.xml
+	type Rootfile struct {
+		FullPath  string `xml:"full-path,attr"`
+		MediaType string `xml:"media-type,attr"`
+	}
+	type Rootfiles struct {
+		Rootfile []Rootfile `xml:"rootfile"`
+	}
+
+	type Container struct {
+		XMLName   xml.Name  `xml:"container"`
+		Xmlns     string    `xml:"xmlns,attr"`
+		Version   string    `xml:"version,attr"`
+		Rootfiles Rootfiles `xml:"rootfiles"`
+	}
+
+	for _, f := range zipReader.File {
+		if f.Name != "META-INF/container.xml" {
+			continue
+		}
+		// parse container.xml; every path below returns, so the deferred
+		// Close runs right after this iteration.
+		r, err := f.Open()
+		if err != nil {
+			return "", err
+		}
+		defer r.Close()
+
+		var c Container
+		if err := xml.NewDecoder(r).Decode(&c); err != nil {
+			return "", fmt.Errorf("failed to decode container.xml: %w", err)
+		}
+		// Guard the index: a container without any <rootfile> must yield an
+		// error, not an index-out-of-range panic.
+		rootfiles := c.Rootfiles.Rootfile
+		if len(rootfiles) == 0 || rootfiles[0].FullPath == "" {
+			return "", errors.New("full-path not found in container.xml")
+		}
+		return rootfiles[0].FullPath, nil
+	}
+	return "", errors.New("container.xml not found")
+}
+
+// valid checks the archive's "mimetype" entry: its content must be exactly
+// "application/epub+zip". An archive without such an entry passes.
+func valid(zipReader *zip.Reader) error {
+	for _, entry := range zipReader.File {
+		if entry.Name != "mimetype" {
+			continue
+		}
+		rc, err := entry.Open()
+		if err != nil {
+			return err
+		}
+		defer rc.Close()
+
+		var content bytes.Buffer
+		if _, err := content.ReadFrom(rc); err != nil {
+			return fmt.Errorf("failed to read mimetype: %w", err)
+		}
+		if content.String() != "application/epub+zip" {
+			return errors.New("invalid mimetype")
+		}
+	}
+	return nil
+}
+
+// Package represents the root element of the OPF file
+type Package struct {
+	XMLName  xml.Name `xml:"package"`
+	Spine    Spine    `xml:"spine"` // content
+	Guide    Guide    `xml:"guide"` // cover
+	Manifest struct { // resource manifest
+		Items []Item `xml:"item"` // resources
+	} `xml:"manifest"`
+	Metadata struct { // metadata
+		// encoding/xml matches struct tags against an element's local name,
+		// so <dc:title> has to be addressed as "title"; a "dc:title" tag
+		// never matches and leaves the field empty.
+		Title string `xml:"title"` // title
+	} `xml:"metadata"`
+}
+
+// Spine represents the spine section of the OPF file
+type Spine struct {
+	Toc      string    `xml:"toc,attr"`
+	ItemRefs []ItemRef `xml:"itemref"`
+}
+
+// ItemRef represents an itemref in the spine section
+type ItemRef struct {
+	IDRef string `xml:"idref,attr"`
+}
+
+// Guide represents the guide section of the OPF file
+type Guide struct {
+	References []Reference `xml:"reference"`
+}
+
+// Reference represents a reference in the guide section
+type Reference struct {
+	Href  string `xml:"href,attr"`
+	Title string `xml:"title,attr"`
+	Type  string `xml:"type,attr"`
+}
+
+// Item represents an item in the manifest section
+type Item struct {
+	ID        string `xml:"id,attr"`
+	Href      string `xml:"href,attr"`
+	MediaType string `xml:"media-type,attr"`
+}
+
+func getOpf(zipReader *zip.Reader) (*Package, error) {
+	// read ./META_INF/container.xml
+	opfPath, err := getFullPath(zipReader)
+	if err != nil {
+		return nil, err
+	}
+	// read ./OEBPS/content.opf
+	for _, f := range zipReader.File {
+		if f.Name == opfPath {
+			r, err := f.Open()
+			if err != nil {
+				return nil, err
+			}
+			defer r.Close()
+			var p Package
+			de := xml.NewDecoder(r)
+			if err := de.Decode(&p); err != nil {
+				return nil, fmt.Errorf("解码OPF文件失败: %v", err)
+			}
+			return &p, nil
+		}
+	}
+	return nil, errors.New("content.opf not found")
+}
+
+// NCX struct definitions: NCX is the root <ncx> element of a table-of-contents file.
+type NCX struct {
+	XMLName xml.Name `xml:"ncx"`
+	NavMap  NavMap   `xml:"navMap"`
+}
+
+// NavMap holds the navPoint elements directly below <navMap>.
+type NavMap struct {
+	NavPoints []NavPoint `xml:"navPoint"`
+}
+
+// NavPoint is one table-of-contents entry.
+type NavPoint struct {
+	ID        string   `xml:"id,attr"`
+	PlayOrder string   `xml:"playOrder,attr"`
+	NavLabel  NavLabel `xml:"navLabel"`
+	Content   Content  `xml:"content"`
+}
+
+// NavLabel carries the display text of a navPoint.
+type NavLabel struct {
+	Text string `xml:"text"`
+}
+
+// Content carries the target of a navPoint.
+type Content struct {
+	Src string `xml:"src,attr"`
+}
+
+// ParseNCX parses an NCX document and returns its table of contents, one map
+// per navPoint found directly under navMap, with the keys "id", "playOrder",
+// "title" and "src". The result is nil when there are no such navPoints.
+func ParseNCX(r io.Reader) ([]map[string]string, error) {
+	doc := NCX{}
+	decoder := xml.NewDecoder(r)
+	if err := decoder.Decode(&doc); err != nil {
+		return nil, fmt.Errorf("解析NCX失败: %v", err)
+	}
+
+	var toc []map[string]string
+	points := doc.NavMap.NavPoints
+	for i := range points {
+		point := &points[i]
+		toc = append(toc, map[string]string{
+			"id":        point.ID,
+			"playOrder": point.PlayOrder,
+			"title":     point.NavLabel.Text,
+			"src":       point.Content.Src,
+		})
+	}
+
+	return toc, nil
+}
--- a/backend/utils/feed.go
+++ b/backend/utils/feed.go
@@ -0,0 +1,239 @@
+package utils
+
+import (
+	"encoding/json"
+	"encoding/xml"
+	"fmt"
+	"strings"
+)
+
+// FeedItem represents a single item in any feed format.
+// Fields:
+// Title: item title
+// Link: item link (URL)
+// Description: item description content
+// Published: publication time (a string whose exact format is decided by the feed source)
+type FeedItem struct {
+	Title       string // item title
+	Link        string // item link URL
+	Description string // item description content
+	Published   string // publication time (string format)
+}
+
+// Feed represents a generic feed structure: the feed's own title, description
+// and link plus its items. The parsers in this file (parseRSS, parseAtom,
+// parseJSONFeed) all produce this type.
+type Feed struct {
+	Title       string
+	Description string
+	Link        string
+	Items       []FeedItem
+}
+
+// cleanXMLContent drops every character that is not allowed by the XML 1.0
+// Char production (https://www.w3.org/TR/xml/#charsets) and returns the rest
+// unchanged.
+func cleanXMLContent(content string) string {
+	isLegal := func(r rune) bool {
+		switch {
+		case r == 0x9, r == 0xA, r == 0xD:
+			return true
+		case r >= 0x20 && r <= 0xD7FF:
+			return true
+		case r >= 0xE000 && r <= 0xFFFD:
+			return true
+		case r >= 0x10000 && r <= 0x10FFFF:
+			return true
+		}
+		return false
+	}
+	return strings.Map(func(r rune) rune {
+		if !isLegal(r) {
+			return -1 // a negative result makes strings.Map drop the rune
+		}
+		return r
+	}, content)
+}
+
+// ParseFeed fetches the feed at url and parses it into the generic Feed structure.
+// Parameters:
+// url: URL of the feed content to parse
+// Returns:
+// *Feed: the parsed generic feed (title, description, link and item list)
+// error: any error met on the way (fetch failure, unsupported format, parse error)
+//
+// The format is chosen by substring: "<rss" selects RSS, "<feed" selects Atom,
+// and `"version":` selects JSON Feed, checked in that order.
+func ParseFeed(url string) (*Feed, error) {
+	raw, err := HTTPGet(url)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get feed content: %v", err)
+	}
+
+	// Decode to a string, then strip characters that are illegal in XML.
+	cleaned := cleanXMLContent(DecodeBytes(raw))
+	payload := []byte(cleaned)
+
+	switch {
+	case strings.Contains(cleaned, "<rss"):
+		return parseRSS(payload)
+	case strings.Contains(cleaned, "<feed"):
+		return parseAtom(payload)
+	case strings.Contains(cleaned, "\"version\":"):
+		return parseJSONFeed(payload)
+	default:
+		return nil, fmt.Errorf("unsupported feed format")
+	}
+}
+
+// parseRSS parses RSS content (such as RSS 2.0).
+// Parameter: content - the RSS document as bytes
+// Returns: the parsed generic Feed, or an error when the XML cannot be decoded
+//
+// An item's link is taken from the first non-empty source in this order:
+// a <link> element (its href attribute, then its text), the Atom extension
+// link, and finally the guid when it is a permalink (isPermaLink absent or
+// "true"). Surrounding whitespace is trimmed from the chosen value.
+func parseRSS(content []byte) (*Feed, error) {
+	type RSSFeed struct {
+		XMLName xml.Name `xml:"rss"`
+		Channel struct {
+			Title       string `xml:"title"`
+			Description string `xml:"description"`
+			Link        string `xml:"link"`
+			AtomLink    struct {
+				Href string `xml:"href,attr"`
+			} `xml:"http://www.w3.org/2005/Atom link"`
+			Items []struct {
+				Title string `xml:"title"`
+				Links []struct {
+					Href  string `xml:"href,attr"`
+					Value string `xml:",chardata"`
+				} `xml:"link"`
+				Description string `xml:"description"`
+				PubDate     string `xml:"pubDate"`
+				Guid        struct {
+					IsPermaLink string `xml:"isPermaLink,attr"`
+					Value       string `xml:",chardata"`
+				} `xml:"guid"`
+				AtomLink struct {
+					Href string `xml:"href,attr"`
+				} `xml:"http://www.w3.org/2005/Atom link"`
+			} `xml:"item"`
+		} `xml:"channel"`
+	}
+
+	var rssFeed RSSFeed
+	if err := xml.Unmarshal(content, &rssFeed); err != nil {
+		return nil, fmt.Errorf("failed to parse RSS: %v", err)
+	}
+
+	feed := &Feed{
+		Title:       rssFeed.Channel.Title,
+		Description: rssFeed.Channel.Description,
+		Link:        rssFeed.Channel.Link,
+		Items:       make([]FeedItem, 0, len(rssFeed.Channel.Items)),
+	}
+
+	for _, item := range rssFeed.Channel.Items {
+		feedItem := FeedItem{
+			Title:       item.Title,
+			Description: item.Description,
+			Published:   item.PubDate,
+		}
+
+		// Each source is consulted only while the link is still empty, so an
+		// empty <link/> no longer hides the Atom link or the guid.
+		link := ""
+		for _, l := range item.Links {
+			if href := strings.TrimSpace(l.Href); href != "" {
+				link = href
+				break
+			}
+			if text := strings.TrimSpace(l.Value); text != "" {
+				link = text
+				break
+			}
+		}
+		if link == "" {
+			link = strings.TrimSpace(item.AtomLink.Href)
+		}
+		if link == "" && (item.Guid.IsPermaLink == "" || item.Guid.IsPermaLink == "true") {
+			link = strings.TrimSpace(item.Guid.Value)
+		}
+		feedItem.Link = link
+
+		feed.Items = append(feed.Items, feedItem)
+	}
+
+	return feed, nil
+}
+
+// parseAtom parses Atom 1.0 content.
+// Parameter: content - the Atom document as bytes
+// Returns: the parsed generic Feed, or an error when the XML cannot be decoded
+//
+// For the feed and for each entry, the link is the href of the first <link>
+// whose rel is "alternate" or absent (Atom treats a missing rel as
+// "alternate"). When no such link exists, the first <link> is used, as before.
+func parseAtom(content []byte) (*Feed, error) {
+	type atomLink struct {
+		Rel  string `xml:"rel,attr"`
+		Href string `xml:"href,attr"`
+	}
+	type AtomFeed struct {
+		XMLName  xml.Name   `xml:"feed"`
+		Title    string     `xml:"title"`
+		Subtitle string     `xml:"subtitle"`
+		Link     []atomLink `xml:"link"`
+		Entries  []struct {
+			Title   string     `xml:"title"`
+			Link    []atomLink `xml:"link"`
+			Summary string     `xml:"summary"`
+			Updated string     `xml:"updated"`
+		} `xml:"entry"`
+	}
+
+	// pickLink prefers the "alternate" relation so that rel="self",
+	// rel="enclosure" and similar links are not mistaken for the page URL.
+	pickLink := func(links []atomLink) string {
+		for _, l := range links {
+			if (l.Rel == "" || l.Rel == "alternate") && l.Href != "" {
+				return l.Href
+			}
+		}
+		if len(links) > 0 {
+			return links[0].Href
+		}
+		return ""
+	}
+
+	var atomFeed AtomFeed
+	if err := xml.Unmarshal(content, &atomFeed); err != nil {
+		return nil, fmt.Errorf("failed to parse Atom: %v", err)
+	}
+
+	feed := &Feed{
+		Title:       atomFeed.Title,
+		Description: atomFeed.Subtitle,
+		Link:        pickLink(atomFeed.Link),
+		Items:       make([]FeedItem, 0, len(atomFeed.Entries)),
+	}
+
+	for _, entry := range atomFeed.Entries {
+		feed.Items = append(feed.Items, FeedItem{
+			Title:       entry.Title,
+			Link:        pickLink(entry.Link),
+			Description: entry.Summary,
+			Published:   entry.Updated,
+		})
+	}
+
+	return feed, nil
+}
+
+// parseJSONFeed parses JSON Feed content (such as version 1.1).
+// Parameter: content - the JSON Feed document as bytes
+// Returns: the parsed generic Feed, or an error when the JSON cannot be decoded
+// Field mapping: home_page_url -> Feed.Link; url -> FeedItem.Link;
+// content_text -> FeedItem.Description; date_published -> FeedItem.Published
+func parseJSONFeed(content []byte) (*Feed, error) {
+	type JSONFeed struct {
+		Version     string `json:"version"`
+		Title       string `json:"title"`
+		Description string `json:"description"`
+		HomePageURL string `json:"home_page_url"`
+		Items       []struct {
+			Title         string `json:"title"`
+			URL           string `json:"url"`
+			ContentText   string `json:"content_text"`
+			DatePublished string `json:"date_published"`
+		} `json:"items"`
+	}
+
+	var parsed JSONFeed
+	if err := json.Unmarshal(content, &parsed); err != nil {
+		return nil, fmt.Errorf("failed to parse JSON Feed: %v", err)
+	}
+
+	items := make([]FeedItem, 0, len(parsed.Items))
+	for i := range parsed.Items {
+		src := &parsed.Items[i]
+		items = append(items, FeedItem{
+			Title:       src.Title,
+			Link:        src.URL,
+			Description: src.ContentText,
+			Published:   src.DatePublished,
+		})
+	}
+
+	return &Feed{
+		Title:       parsed.Title,
+		Description: parsed.Description,
+		Link:        parsed.HomePageURL,
+		Items:       items,
+	}, nil
+}
--- a/backend/utils/file.go
+++ b/backend/utils/file.go
@@ -0,0 +1,16 @@
+package utils
+
+import (
+	"path/filepath"
+	"slices"
+	"strings"
+)
+
+// IsImageFile reports whether filename has one of the supported image
+// extensions (.jpg, .jpeg, .png, .webp), compared case-insensitively.
+func IsImageFile(filename string) bool {
+	return slices.Contains(
+		[]string{".jpg", ".jpeg", ".png", ".webp"},
+		strings.ToLower(filepath.Ext(filename)),
+	)
+}
--- a/backend/utils/ip_addr.go
+++ b/backend/utils/ip_addr.go
@@ -0,0 +1,188 @@
+package utils
+
+import (
+	"fmt"
+	"net"
+	"net/http"
+	"net/netip"
+	"net/url"
+	"strings"
+
+	"github.com/labstack/echo/v4"
+)
+
+// documentationPrefixes lists the address blocks reserved for documentation
+// and examples; isDocumentationIP matches addresses against it.
+var documentationPrefixes = []netip.Prefix{
+	netip.MustParsePrefix("192.0.2.0/24"),    // TEST-NET-1
+	netip.MustParsePrefix("198.51.100.0/24"), // TEST-NET-2
+	netip.MustParsePrefix("203.0.113.0/24"),  // TEST-NET-3
+	netip.MustParsePrefix("2001:db8::/32"),   // IPv6 Documentation
+}
+
+// GetClientIPFromRemoteAddr returns the host part of the RemoteAddr of the
+// request behind c; it does not look at any forwarding headers.
+func GetClientIPFromRemoteAddr(c echo.Context) string {
+	req := c.Request()
+	return ExtractHostFromRemoteAddr(req)
+}
+
+// ExtractHostFromRemoteAddr returns the host part of r.RemoteAddr.
+// An empty RemoteAddr yields ""; a value that is not in host:port form is
+// returned as is, with surrounding whitespace trimmed.
+func ExtractHostFromRemoteAddr(r *http.Request) string {
+	remote := r.RemoteAddr
+	if remote == "" {
+		return ""
+	}
+	if host, _, err := net.SplitHostPort(remote); err == nil {
+		return host
+	}
+	return strings.TrimSpace(remote)
+}
+
+// IsPrivateOrReservedIP reports whether ipStr is a private or reserved
+// address. A string that does not parse as an IP address yields false.
+//
+// The checks run in this order:
+//   - private ranges: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, fc00::/7
+//   - loopback: 127.0.0.0/8, ::1/128
+//   - link-local: 169.254.0.0/16, fe80::/10, and link-local multicast
+//   - documentation: 192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24, 2001:db8::/32
+//   - the remaining reserved ranges handled by isOtherReservedIP
+func IsPrivateOrReservedIP(ipStr string) bool {
+	ip := net.ParseIP(ipStr)
+	switch {
+	case ip == nil:
+		return false // invalid IP address
+	case ip.IsPrivate():
+		return true
+	case ip.IsLoopback():
+		return true
+	case ip.IsLinkLocalUnicast(), ip.IsLinkLocalMulticast():
+		return true
+	case isDocumentationIP(ip):
+		return true
+	default:
+		return isOtherReservedIP(ip)
+	}
+}
+
+// isDocumentationIP reports whether ip lies in one of documentationPrefixes.
+// An ip whose length is neither 4 nor 16 bytes yields false.
+func isDocumentationIP(ip net.IP) bool {
+	addr, ok := netip.AddrFromSlice(ip)
+	if !ok {
+		return false
+	}
+
+	// Unmap IPv4-mapped IPv6 addresses so they compare against the IPv4 prefixes.
+	unmapped := addr.Unmap()
+
+	for i := range documentationPrefixes {
+		if documentationPrefixes[i].Contains(unmapped) {
+			return true
+		}
+	}
+	return false
+}
+
+// isOtherReservedIP checks for other reserved IP ranges
+func isOtherReservedIP(ip net.IP) bool {
+	if ip4 := ip.To4(); ip4 != nil {
+		// Other reserved IPv4 ranges:
+		//   0.0.0.0/8 - Current network (RFC 1122)
+		//   100.64.0.0/10 - Shared Address Space (RFC 6598)
+		//   192.0.0.0/24 - IETF Protocol Assignments (RFC 6890)
+		//   192.88.99.0/24 - IPv6 to IPv4 relay (RFC 3068)
+		//   198.18.0.0/15 - Network benchmark tests (RFC 2544)
+		//   240.0.0.0/4 - Reserved (RFC 1112)
+		return ip4[0] == 0 ||
+			(ip4[0] == 100 && (ip4[1]&0xc0) == 64) ||
+			(ip4[0] == 192 && ip4[1] == 0 && ip4[2] == 0) ||
+			(ip4[0] == 192 && ip4[1] == 88 && ip4[2] == 99) ||
+			(ip4[0] == 198 && (ip4[1]&0xfe) == 18) ||
+			(ip4[0]&0xf0) == 240
+	}
+
+	// Other reserved IPv6 ranges:
+	//   ::/128 - Unspecified address
+	//   ::1/128 - Loopback address (already covered by IsLoopback())
+	//   ::ffff:0:0/96 - IPv4-mapped IPv6 address
+	//   64:ff9b::/96 - IPv4-IPv6 translation (RFC 6052)
+	//   100::/64 - Discard prefix (RFC 6666)
+	//   2001::/23 - IETF Protocol Assignments
+	//   2001:2::/48 - Benchmarking (RFC 5180)
+	//   2002::/16 - 6to4 (RFC 3056)
+	//   fe80::/10 - Link-local (already covered by IsLinkLocalUnicast())
+	//   ff00::/8 - Multicast
+	return ip.Equal(net.IPv6unspecified) ||
+		ip.Equal(net.ParseIP("::ffff:0:0")) ||
+		ip.Equal(net.ParseIP("64:ff9b::")) ||
+		ip.Equal(net.ParseIP("100::")) ||
+		(len(ip) == net.IPv6len && ip[0] == 0x20 && ip[1] == 0x01 && (ip[2]&0xfe) == 0) ||
+		(len(ip) == net.IPv6len && ip[0] == 0x20 && ip[1] == 0x01 && ip[2] == 0x00 && ip[3] == 0x02) ||
+		(len(ip) == net.IPv6len && ip[0] == 0x20 && ip[1] == 0x02) ||
+		(len(ip) == net.IPv6len && ip[0] == 0xff)
+}
+
+// IsIPv6 reports whether ipStr parses as an IP address that has no 4-byte
+// form; IPv4 and IPv4-mapped IPv6 addresses therefore yield false, as does
+// any string that is not an IP address.
+func IsIPv6(ipStr string) bool {
+	parsed := net.ParseIP(ipStr)
+	if parsed == nil {
+		return false
+	}
+	return parsed.To4() == nil
+}
+
+// ValidateURLForSSRF validates a URL to prevent SSRF attacks.
+// It rejects, in this order:
+//   - a URL that does not parse
+//   - a scheme other than http or https
+//   - a URL carrying credentials (userinfo)
+//   - a URL without a hostname
+//   - a hostname that fails to resolve
+//   - a hostname for which any resolved address is private or reserved
+//
+// The check looks at the addresses resolved at call time only; the request
+// itself is made elsewhere.
+func ValidateURLForSSRF(urlStr string) error {
+	parsedURL, err := url.Parse(urlStr)
+	if err != nil {
+		return fmt.Errorf("invalid URL format: %w", err)
+	}
+
+	switch parsedURL.Scheme {
+	case "http", "https":
+		// allowed
+	default:
+		return fmt.Errorf("invalid URL scheme: only http and https are allowed")
+	}
+
+	if parsedURL.User != nil {
+		return fmt.Errorf("URLs with credentials are not allowed")
+	}
+
+	hostname := parsedURL.Hostname()
+	if hostname == "" {
+		return fmt.Errorf("invalid URL: missing hostname")
+	}
+
+	resolved, err := net.LookupIP(hostname)
+	if err != nil {
+		return fmt.Errorf("failed to resolve hostname: %w", err)
+	}
+
+	// A single private or reserved address is enough to reject the URL.
+	for i := range resolved {
+		if IsPrivateOrReservedIP(resolved[i].String()) {
+			return fmt.Errorf("access to private/reserved IP addresses is not allowed")
+		}
+	}
+
+	return nil
+}
--- a/backend/utils/processor.go
+++ b/backend/utils/processor.go
@@ -0,0 +1,75 @@
+package utils
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"sync"
+)
+
+// Node is one node of a ProcessorTree: its own buffered data plus its child
+// nodes in the order they were created.
+type Node struct {
+	buf *bytes.Buffer
+	son []*Node
+}
+
+// newNode returns a node with an empty buffer and no children.
+func newNode() *Node {
+	return &Node{son: []*Node{}, buf: bytes.NewBufferString("")}
+}
+
+// ProcessorTree collects byte chunks in a tree of nodes and flattens them in
+// pre-order (a node's data, then its children left to right). All methods
+// take mu, so nodes may be created, filled and read from several goroutines.
+type ProcessorTree struct {
+	mu     *sync.Mutex
+	root   *Node
+	result *bytes.Buffer // scratch buffer reused by GetResult
+}
+
+// NewProcessorTree returns a tree that consists of an empty root node.
+func NewProcessorTree() *ProcessorTree {
+	return &ProcessorTree{
+		root:   newNode(),
+		mu:     &sync.Mutex{},
+		result: bytes.NewBufferString(""),
+	}
+}
+
+// GetNode creates a new node as the last child of father and returns it.
+// It returns an error when father is nil.
+func (t *ProcessorTree) GetNode(father *Node) (*Node, error) {
+	if father == nil {
+		return nil, errors.New("father is nil")
+	}
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	child := newNode()
+	father.son = append(father.son, child)
+	return child, nil
+}
+
+// Add appends data to node's buffer. It returns an error when node is nil.
+func (t *ProcessorTree) Add(node *Node, data []byte) error {
+	if node == nil {
+		return errors.New("node is nil")
+	}
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	node.buf.Write(data)
+	return nil
+}
+
+// GetResult returns the concatenation of all node buffers in pre-order.
+//
+// The tree is read under the lock and left unchanged, so the call can be
+// repeated and always reflects the current content in tree order. The
+// returned slice is a copy and stays valid after later calls.
+func (t *ProcessorTree) GetResult() ([]byte, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	t.result.Reset()
+	if err := t.getRes(t.root); err != nil {
+		return nil, err
+	}
+	return append([]byte(nil), t.result.Bytes()...), nil
+}
+
+// getRes appends node's data and then, recursively, that of its children to
+// t.result. The caller must hold t.mu.
+func (t *ProcessorTree) getRes(node *Node) error {
+	if node == nil {
+		return nil
+	}
+	// Copy from a reader over the bytes instead of from the buffer itself:
+	// reading from node.buf directly would drain it.
+	if _, err := io.Copy(t.result, bytes.NewReader(node.buf.Bytes())); err != nil {
+		return err
+	}
+	for _, son := range node.son {
+		if err := t.getRes(son); err != nil {
+			return err
+		}
+	}
+	return nil
+}
--- a/backend/utils/time.go
+++ b/backend/utils/time.go
@@ -0,0 +1,7 @@
+package utils
+
+import "time"
+
+// GetTimeHourOffset returns the current time truncated to the hour, shifted
+// by the given number of hours (negative values go back in time).
+func GetTimeHourOffset(hours int64) time.Time {
+	currentHour := time.Now().Truncate(time.Hour)
+	shift := time.Duration(hours) * time.Hour
+	return currentHour.Add(shift)
+}
--- a/backend/utils/utils.go
+++ b/backend/utils/utils.go
@@ -0,0 +1,366 @@
+package utils
+
+import (
+	"bytes"
+	"context"
+	"crypto/tls"
+	"fmt"
+	"io"
+	"mime"
+	"net/http"
+	"net/url"
+	"os"
+	"path"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
+	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
+	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
+	"github.com/google/uuid"
+	"github.com/minio/minio-go/v7"
+	tiktoken_loader "github.com/pkoukk/tiktoken-go-loader"
+	"github.com/yuin/goldmark"
+	"github.com/yuin/goldmark/ast"
+	"github.com/yuin/goldmark/renderer/html"
+	"github.com/yuin/goldmark/text"
+
+	"github.com/chaitin/panda-wiki/domain"
+	"github.com/chaitin/panda-wiki/store/s3"
+)
+
+// HTTPGet sends an HTTP GET request to url with a 10 second timeout and
+// returns the response body.
+//
+// An error is returned when the request fails (the underlying error is
+// wrapped and can be inspected with errors.Is / errors.As), when the response
+// status is not 200 OK, or when reading the body fails.
+func HTTPGet(url string) ([]byte, error) {
+	client := &http.Client{
+		Timeout: 10 * time.Second,
+		Transport: &http.Transport{
+			// NOTE(review): TLS certificate verification is disabled, so the
+			// peer is not authenticated. Confirm that this is intended for
+			// every caller before relying on the returned data.
+			TLSClientConfig: &tls.Config{
+				InsecureSkipVerify: true,
+			},
+		},
+	}
+	// The transport is private to this call; without this its idle keep-alive
+	// connection would stay open after the function returns.
+	defer client.CloseIdleConnections()
+
+	resp, err := client.Get(url)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get %s: %w", url, err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
+	}
+
+	return io.ReadAll(resp.Body)
+}
+
+// DecodeBytes converts data to a string by trying a fixed list of encodings
+// in order and returning the first result that decodes without error. When
+// every attempt fails the bytes are converted as-is.
+func DecodeBytes(data []byte) string {
+	candidates := [...]string{"utf-8", "gbk", "gb2312", "big5"}
+	for _, name := range candidates {
+		text, err := decode(data, name)
+		if err != nil {
+			continue
+		}
+		return text
+	}
+	return string(data)
+}
+
+// IsURLValid reports whether urlStr parses as a URL that has both a scheme
+// and a host.
+func IsURLValid(urlStr string) bool {
+	parsed, err := url.Parse(urlStr)
+	return err == nil && parsed.Scheme != "" && parsed.Host != ""
+}
+
+// URLNormalize returns a canonical form of urlStr: the fragment is dropped,
+// a non-empty path is cleaned with path.Clean, and an explicit default port
+// (80 for http, 443 for https) is removed. Input that cannot be parsed is
+// returned unchanged.
+func URLNormalize(urlStr string) string {
+	u, err := url.Parse(urlStr)
+	if err != nil {
+		return urlStr
+	}
+
+	// remove url fragment
+	u.Fragment = ""
+
+	// normalize path; an empty path is left alone because path.Clean("")
+	// returns ".", which would be serialised as a bogus "/." segment
+	if u.Path != "" {
+		u.Path = path.Clean(u.Path)
+	}
+
+	// remove default port by trimming the ":port" suffix from Host, which
+	// keeps the brackets around IPv6 literals (Hostname() would strip them)
+	port := u.Port()
+	if (u.Scheme == "http" && port == "80") || (u.Scheme == "https" && port == "443") {
+		u.Host = strings.TrimSuffix(u.Host, ":"+port)
+	}
+
+	return u.String()
+}
+
+// URLRemovePath strips the path, query string and fragment from rawURL and
+// returns what is left (scheme, user info and host). It returns an error when
+// rawURL cannot be parsed.
+func URLRemovePath(rawURL string) (string, error) {
+	u, err := url.Parse(rawURL)
+	if err != nil {
+		return "", err
+	}
+
+	u.Path, u.RawPath = "", ""
+	u.RawQuery, u.Fragment = "", ""
+
+	return u.String(), nil
+}
+
+// decode is a placeholder for charset conversion: it currently ignores
+// encoding and returns data converted directly to a string with a nil error.
+// A real implementation would use the golang.org/x/text/encoding package.
+func decode(data []byte, encoding string) (string, error) {
+	_ = encoding // not consulted yet
+	text := string(data)
+	return text, nil
+}
+
+// GetHeaderMap parses header as newline-separated "key=value" lines and
+// returns them as a map. Each line is split at its first "="; lines without
+// "=" are skipped, and a later duplicate key overwrites an earlier one.
+func GetHeaderMap(header string) map[string]string {
+	lines := strings.Split(header, "\n")
+	parsed := make(map[string]string)
+	for i := range lines {
+		name, val, found := strings.Cut(lines[i], "=")
+		if !found {
+			continue
+		}
+		parsed[name] = val
+	}
+	return parsed
+}
+
+// UrlEncode percent-encodes every non-ASCII rune of s with url.QueryEscape
+// and copies all ASCII runes (including "/") through unchanged.
+func UrlEncode(s string) string {
+	var out strings.Builder
+	for _, ch := range s {
+		if ch >= 128 {
+			out.WriteString(url.QueryEscape(string(ch)))
+			continue
+		}
+		out.WriteRune(ch)
+	}
+	return out.String()
+}
+
+// RemoveFirstDir drops the first slash-separated component of path and joins
+// the remaining components with filepath.Join. A path with a single component
+// is returned unchanged.
+func RemoveFirstDir(path string) string {
+	// Split the slash-normalised path into its components.
+	segments := strings.Split(filepath.ToSlash(path), "/")
+
+	// Nothing to remove when there is only one component.
+	if len(segments) <= 1 {
+		return path
+	}
+	return filepath.Join(segments[1:]...)
+}
+
+// RemoveURLParams removes the query string from rawURL and returns the
+// re-serialised URL. It returns an error when rawURL cannot be parsed.
+func RemoveURLParams(rawURL string) (string, error) {
+	u, err := url.Parse(rawURL)
+	if err != nil {
+		return "", err
+	}
+
+	// Clearing RawQuery is enough: String() then omits the "?..." part.
+	u.RawQuery = ""
+	stripped := u.String()
+	return stripped, nil
+}
+
+// UploadImage loads an image from imageURL and stores it in the domain.Bucket
+// bucket under the object name "<kbID>/<uuid><ext>". The image is downloaded
+// over HTTP(S) when imageURL starts with "http://" or "https://", otherwise
+// it is read from the local filesystem.
+//
+// The object's content type is taken from the HTTP response when present,
+// otherwise derived from the file extension, and finally falls back to
+// "application/octet-stream". The URL-decoded file name is stored in the
+// "originalname" user metadata entry.
+//
+// On success it returns the object path in the form
+// "/<bucket>/<kbID>/<uuid><ext>". An error is returned when minioClient is
+// nil or when loading, parsing or uploading fails.
+//
+// NOTE(review): imageURL is fetched or read exactly as given. If it can come
+// from untrusted input, callers should validate it first (SSRF / arbitrary
+// local file read).
+func UploadImage(ctx context.Context, minioClient *s3.MinioClient, imageURL string, kbID string) (string, error) {
+	if minioClient == nil {
+		return "", fmt.Errorf("minio client is nil")
+	}
+	var data []byte
+	var contentType string
+	if strings.HasPrefix(imageURL, "http://") || strings.HasPrefix(imageURL, "https://") {
+		// Bind the download to ctx so cancellation and deadlines apply to it.
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, imageURL, nil)
+		if err != nil {
+			return "", fmt.Errorf("failed to fetch image: %w", err)
+		}
+		resp, err := http.DefaultClient.Do(req)
+		if err != nil {
+			return "", fmt.Errorf("failed to fetch image: %w", err)
+		}
+		defer resp.Body.Close()
+
+		// Only a 200 response carries the image.
+		if resp.StatusCode != http.StatusOK {
+			return "", fmt.Errorf("HTTP request failed with status: %s", resp.Status)
+		}
+
+		// Read the image bytes.
+		data, err = io.ReadAll(resp.Body)
+		if err != nil {
+			return "", fmt.Errorf("failed to read image data: %w", err)
+		}
+
+		// Prefer the content type reported by the server.
+		contentType = resp.Header.Get("Content-Type")
+	} else {
+		// Read the image from the local filesystem.
+		var err error
+		data, err = os.ReadFile(imageURL)
+		if err != nil {
+			return "", fmt.Errorf("failed to read image file: %w", err)
+		}
+	}
+
+	// Extract the file name from the URL path.
+	parsedURL, err := url.Parse(imageURL)
+	if err != nil {
+		return "", fmt.Errorf("failed to parse URL: %w", err)
+	}
+	_, filename := filepath.Split(parsedURL.Path)
+	// Undo URL encoding (e.g. non-ASCII file names); keep the raw name when
+	// it cannot be decoded.
+	decodedName, err := url.PathUnescape(filename)
+	if err != nil {
+		decodedName = filename
+	}
+
+	ext := strings.ToLower(filepath.Ext(decodedName))
+	// Fall back to the extension only when no content type is known yet, so a
+	// type reported by the server is never discarded.
+	if contentType == "" {
+		contentType = mime.TypeByExtension(ext)
+	}
+	if contentType == "" {
+		contentType = "application/octet-stream"
+	}
+	imgName := fmt.Sprintf("%s/%s%s", kbID, uuid.New().String(), ext)
+
+	if _, err := minioClient.PutObject(
+		ctx,
+		domain.Bucket,
+		imgName,
+		bytes.NewReader(data),
+		int64(len(data)),
+		minio.PutObjectOptions{
+			ContentType: contentType,
+			UserMetadata: map[string]string{
+				"originalname": decodedName,
+			},
+		},
+	); err != nil {
+		return "", fmt.Errorf("failed to upload image to MinIO: %w", err)
+	}
+	return fmt.Sprintf("/%s/%s", domain.Bucket, imgName), nil
+}
+
+func GetTitleFromMarkdown(markdown string) string {
+	title := strings.TrimSpace(markdown)
+	runes := []rune(title)
+	if len(runes) > 60 {
+		return string(runes[:60])
+	}
+	return title
+}
+
+// ExchangeMarkDownImageUrl replaces the destination of every image in
+// mdContent with the value returned by getUrl and returns the resulting
+// document as Markdown.
+//
+// getUrl is called once per image, concurrently, with a pointer to a copy of
+// the original destination. If any call fails, the first error received is
+// returned. The rewritten document is rendered to HTML with goldmark and then
+// converted back to Markdown.
+func ExchangeMarkDownImageUrl(
+	ctx context.Context,
+	mdContent []byte,
+	getUrl func(ctx context.Context, originUrl *string) (string, error),
+) (string, error) {
+	engine := goldmark.New(
+		goldmark.WithRendererOptions(
+			html.WithHardWraps(),
+		),
+	)
+	root := engine.Parser().Parse(text.NewReader(mdContent))
+
+	// 1. Collect every image node of the document.
+	var images []*ast.Image
+	collect := func(n ast.Node, entering bool) (ast.WalkStatus, error) {
+		if entering {
+			if img, ok := n.(*ast.Image); ok {
+				images = append(images, img)
+			}
+		}
+		return ast.WalkContinue, nil
+	}
+	if err := ast.Walk(root, collect); err != nil {
+		return "", err
+	}
+
+	// 2. Resolve the new destinations concurrently.
+	type outcome struct {
+		pos  int
+		dest string
+		err  error
+	}
+
+	// The channel has room for every outcome, so workers never block on send
+	// even when the consumer below returns early.
+	outcomes := make(chan outcome, len(images))
+	var pending sync.WaitGroup
+	pending.Add(len(images))
+
+	for pos, img := range images {
+		go func(pos int, origin string) {
+			defer pending.Done()
+			dest, err := getUrl(ctx, &origin)
+			outcomes <- outcome{pos: pos, dest: dest, err: err}
+		}(pos, string(img.Destination))
+	}
+
+	// Close the channel once every worker has reported.
+	go func() {
+		pending.Wait()
+		close(outcomes)
+	}()
+
+	// 3. Apply the outcomes; the first failure aborts.
+	for out := range outcomes {
+		if out.err != nil {
+			return "", out.err
+		}
+		images[out.pos].Destination = []byte(out.dest)
+	}
+
+	// 4. Render the modified document.
+	var rendered bytes.Buffer
+	if err := engine.Renderer().Render(&rendered, mdContent, root); err != nil {
+		return "", err
+	}
+
+	// 5. Convert the rendered output back to Markdown.
+	toMarkdown := converter.NewConverter(
+		converter.WithPlugins(
+			base.NewBasePlugin(),
+			commonmark.NewCommonmarkPlugin(
+				commonmark.WithStrongDelimiter("__"),
+			),
+		),
+	)
+	markdown, err := toMarkdown.ConvertReader(&rendered)
+	if err != nil {
+		return "", err
+	}
+	return string(markdown), nil
+}
+
+// Localloader is a stateless type whose LoadTiktokenBpe method loads the
+// tiktoken BPE table through tiktoken_loader's offline loader.
+type Localloader struct{}
+
+// LoadTiktokenBpe ignores the requested file name and always loads
+// "cl100k_base.tiktoken" through a new offline loader.
+func (m *Localloader) LoadTiktokenBpe(_ string) (map[string]int, error) {
+	return tiktoken_loader.NewOfflineLoader().LoadTiktokenBpe("cl100k_base.tiktoken")
+}
+
+// GetFileNameWithoutExt returns the last element of path with its extension
+// (as reported by filepath.Ext) removed.
+func GetFileNameWithoutExt(path string) string {
+	base := filepath.Base(path)
+	ext := filepath.Ext(base)
+	// ext is always a suffix of base, so slicing it off equals trimming it.
+	return base[:len(base)-len(ext)]
+}
+
+// IsUUID reports whether s is accepted by uuid.Parse.
+func IsUUID(s string) bool {
+	if _, err := uuid.Parse(s); err != nil {
+		return false
+	}
+	return true
+}
+
+// IsLikelyHTML is a cheap heuristic: it reports whether text, ignoring
+// surrounding whitespace, starts with "<" and ends with ">".
+func IsLikelyHTML(text string) bool {
+	body := strings.TrimSpace(text)
+	if !strings.HasPrefix(body, "<") {
+		return false
+	}
+	return strings.HasSuffix(body, ">")
+}