init push
This commit is contained in:
182
backend/utils/DFA.go
Normal file
182
backend/utils/DFA.go
Normal file
@@ -0,0 +1,182 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"sync"
|
||||
)
|
||||
|
||||
var (
|
||||
dfaInstance map[string]*DFAInstance
|
||||
mu sync.RWMutex
|
||||
)
|
||||
|
||||
type DFAInstance struct {
|
||||
DFA *DFA
|
||||
BuffSize int
|
||||
}
|
||||
|
||||
// GetDFA returns the singleton instance of DFA
|
||||
func GetDFA(kbID string) *DFAInstance {
|
||||
mu.RLock()
|
||||
defer mu.RUnlock()
|
||||
return dfaInstance[kbID]
|
||||
}
|
||||
|
||||
// InitDFA Initialize a new DFA. --> this func used by pro
|
||||
func InitDFA(kbID string, words []string) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
newDFA := &DFA{
|
||||
Root: NewTrieNode(),
|
||||
}
|
||||
var BuffSize int // 默认为0
|
||||
for _, word := range words {
|
||||
newDFA.AddWord(word)
|
||||
if BuffSize < len([]rune(word)) {
|
||||
BuffSize = len([]rune(word))
|
||||
}
|
||||
}
|
||||
if dfaInstance == nil {
|
||||
dfaInstance = make(map[string]*DFAInstance)
|
||||
}
|
||||
dfaInstance[kbID] = &DFAInstance{
|
||||
DFA: newDFA,
|
||||
BuffSize: BuffSize,
|
||||
}
|
||||
}
|
||||
|
||||
// TrieNode is one node of the sensitive-word trie.
type TrieNode struct {
	// Children maps the next rune to the node reached by consuming it.
	Children map[rune]*TrieNode
	// IsEnd reports whether the path from the root to this node spells a
	// complete word.
	IsEnd bool
}

// NewTrieNode returns an empty, non-terminal node with an initialized
// child map.
func NewTrieNode() *TrieNode {
	return &TrieNode{Children: make(map[rune]*TrieNode)}
}

// DFA is a rune-level trie used to detect and mask sensitive words.
//
// The methods do no locking of their own: callers that mutate a DFA while
// other goroutines use it must synchronize externally.
type DFA struct {
	// Root is the entry node; it must be non-nil before any method is called.
	Root *TrieNode
}

// AddWord inserts word into the trie, creating nodes as needed, and marks
// the final node as the end of a word.
func (d *DFA) AddWord(word string) {
	node := d.Root
	for _, char := range word {
		next, ok := node.Children[char]
		if !ok {
			next = NewTrieNode()
			node.Children[char] = next
		}
		node = next
	}
	node.IsEnd = true
}

// UpdateOldWord replaces oldWord with newWord: it deletes oldWord (a no-op
// when it is absent) and then adds newWord.
func (d *DFA) UpdateOldWord(oldWord, newWord string) {
	d.DeleteWord(oldWord)
	d.AddWord(newWord)
}

// DeleteWord removes word from the trie and prunes nodes that no longer
// lead to any word.
//
// It returns true when word was present and has been removed, false when
// the trie did not contain it. (Previously the result only reflected
// whether the root itself became prunable, so deleting one of several
// words returned false.)
func (d *DFA) DeleteWord(word string) bool {
	runes := []rune(word)
	removed := false

	// prune walks down the word and reports whether the visited node has
	// become useless (no children, not a word end) so its parent can drop it.
	var prune func(node *TrieNode, index int) bool
	prune = func(node *TrieNode, index int) bool {
		if index == len(runes) {
			if !node.IsEnd {
				return false // the path exists but is not a stored word
			}
			node.IsEnd = false
			removed = true
			return len(node.Children) == 0
		}

		char := runes[index]
		child, ok := node.Children[char]
		if !ok {
			return false // the word is not in the trie
		}
		if !prune(child, index+1) {
			return false
		}
		delete(node.Children, char)
		// Keep this node if it still leads to, or terminates, another word.
		return len(node.Children) == 0 && !node.IsEnd
	}

	prune(d.Root, 0)
	return removed
}

// DeleteWordBatch removes every word in words from the trie.
//
// The deletions run sequentially: DeleteWord mutates shared child maps
// without synchronization, so running them in parallel goroutines is a
// data race that can abort the process with "concurrent map writes".
func (d *DFA) DeleteWordBatch(words []string) {
	for _, word := range words {
		d.DeleteWord(word)
	}
}

// Filter returns text with every rune that belongs to a sensitive word
// replaced by '🚫'.
//
// Matching is done against the original runes while the mask is written
// to a separate copy, so overlapping words (e.g. "ab" and "bc" in "abc")
// are all masked instead of the first replacement hiding the second.
func (d *DFA) Filter(text string) string {
	source := []rune(text)
	masked := make([]rune, len(source))
	copy(masked, source)

	for i := range source { // every position is tried as a word start
		node := d.Root
		for j := i; j < len(source); j++ {
			next, ok := node.Children[source[j]]
			if !ok {
				break
			}
			node = next
			if node.IsEnd {
				// source[i..j] is a complete word; keep scanning so longer
				// words sharing this prefix are masked as well.
				for k := i; k <= j; k++ {
					masked[k] = '🚫'
				}
			}
		}
	}
	return string(masked)
}

// Check reports whether text contains a sensitive word.
//
// It returns nil when no word matches; otherwise it returns an error whose
// message names the first match found (earliest start position, shortest
// word from that position).
func (d *DFA) Check(text string) error {
	runes := []rune(text)
	for start := range runes {
		node := d.Root
		for end := start; end < len(runes); end++ {
			next, ok := node.Children[runes[end]]
			if !ok {
				break
			}
			node = next
			if node.IsEnd {
				return errors.New("包含敏感词: " + string(runes[start:end+1]))
			}
		}
	}
	return nil
}
|
||||
430
backend/utils/epub.go
Normal file
430
backend/utils/epub.go
Normal file
@@ -0,0 +1,430 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/xml"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
|
||||
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
|
||||
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
|
||||
"github.com/chaitin/panda-wiki/domain"
|
||||
"github.com/chaitin/panda-wiki/log"
|
||||
"github.com/chaitin/panda-wiki/store/s3"
|
||||
"github.com/google/uuid"
|
||||
"github.com/minio/minio-go/v7"
|
||||
"golang.org/x/sync/semaphore"
|
||||
)
|
||||
|
||||
type EpubConverter struct {
|
||||
logger *log.Logger
|
||||
mu sync.Mutex
|
||||
minioClient *s3.MinioClient
|
||||
// relative path -> oss path
|
||||
resources map[string]string
|
||||
// id -> relative path
|
||||
resourcesIdMap map[string]Item
|
||||
// relative path -> id
|
||||
relativePath map[string]string
|
||||
}
|
||||
|
||||
func NewEpubConverter(logger *log.Logger, minio *s3.MinioClient) *EpubConverter {
|
||||
return &EpubConverter{
|
||||
logger: logger.WithModule("epubConverter"),
|
||||
minioClient: minio,
|
||||
resources: make(map[string]string),
|
||||
resourcesIdMap: make(map[string]Item),
|
||||
relativePath: make(map[string]string),
|
||||
}
|
||||
}
|
||||
|
||||
func (e *EpubConverter) Convert(ctx context.Context, kbID string, data *multipart.FileHeader) (string, []byte, error) {
|
||||
reader, err := data.Open()
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
defer reader.Close()
|
||||
zipReader, err := zip.NewReader(reader, data.Size)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
if err := valid(zipReader); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
// read ./path/to/content.opf
|
||||
var p *Package
|
||||
if p, err = getOpf(zipReader); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
for _, item := range p.Manifest.Items {
|
||||
e.resourcesIdMap[item.ID] = item
|
||||
e.relativePath[item.Href] = item.ID
|
||||
}
|
||||
|
||||
// resolve resource file
|
||||
if err := e.uploadFile(ctx, kbID, zipReader); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
conv := converter.NewConverter(
|
||||
converter.WithPlugins(
|
||||
base.NewBasePlugin(),
|
||||
commonmark.NewCommonmarkPlugin(
|
||||
commonmark.WithStrongDelimiter("__"),
|
||||
),
|
||||
),
|
||||
)
|
||||
conv.Register.TagType("a", converter.TagTypeRemove, converter.PriorityStandard)
|
||||
|
||||
res := make(map[string]*bytes.Buffer)
|
||||
var toc []map[string]string
|
||||
for _, zipfile := range zipReader.File {
|
||||
ext := strings.ToLower(filepath.Ext(zipfile.Name))
|
||||
if ext == ".ncx" {
|
||||
file, err := zipfile.Open()
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
toc, err = ParseNCX(file)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
}
|
||||
file, err := zipfile.Open()
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
htmlStr, err := io.ReadAll(file)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
mdStr, err := conv.ConvertString((string(htmlStr)))
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
e.logger.Info("convert File", "file name", clearFileName(zipfile.Name))
|
||||
res[clearFileName(zipfile.Name)] = bytes.NewBufferString(mdStr)
|
||||
}
|
||||
// page sequence
|
||||
result := bytes.NewBuffer(nil)
|
||||
for _, href := range p.Guide.References {
|
||||
if r, ok := res[clearFileName(href.Href)]; ok {
|
||||
if _, err := io.Copy(result, r); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
result.WriteString("\n\n")
|
||||
}
|
||||
}
|
||||
result.WriteString("# 目录\n\n")
|
||||
for _, v := range toc {
|
||||
fmt.Fprintf(result, "- [%s](#%s)\n", v["title"], v["playOrder"])
|
||||
}
|
||||
temp := make(map[string]string)
|
||||
for _, v := range toc {
|
||||
temp[v["src"]] = v["playOrder"]
|
||||
}
|
||||
for _, itemRef := range p.Spine.ItemRefs {
|
||||
title := temp[e.resourcesIdMap[itemRef.IDRef].Href]
|
||||
e.logger.Debug("add File", "file name", clearFileName(e.resourcesIdMap[itemRef.IDRef].Href))
|
||||
if r, ok := res[clearFileName(e.resourcesIdMap[itemRef.IDRef].Href)]; ok {
|
||||
result.WriteString("<span id=" + title + "></span>\n\n")
|
||||
if _, err := io.Copy(result, r); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
result.WriteString("\n\n")
|
||||
}
|
||||
}
|
||||
str, err := e.exchangeUrl(ctx, result.String())
|
||||
return p.Metadata.Title, str, err
|
||||
}
|
||||
|
||||
// clearFileName reduces a path or href to its base file name and drops
// everything from the first '#' on (the URL fragment).
func clearFileName(str string) string {
	name, _, _ := strings.Cut(filepath.Base(str), "#")
	return name
}
|
||||
|
||||
func (e *EpubConverter) uploadFile(ctx context.Context, kbID string, zipReader *zip.Reader) error {
|
||||
var wg sync.WaitGroup
|
||||
errCh := make(chan error, len(zipReader.File))
|
||||
sem := semaphore.NewWeighted(10) // 控制并发数为10
|
||||
|
||||
for _, f := range zipReader.File {
|
||||
if isSkippableFile(f.Name) {
|
||||
continue
|
||||
}
|
||||
|
||||
if err := sem.Acquire(ctx, 1); err != nil {
|
||||
return err // 如果获取信号量失败(如context取消),直接返回错误
|
||||
}
|
||||
|
||||
wg.Add(1)
|
||||
|
||||
go func(f *zip.File) {
|
||||
defer func() {
|
||||
sem.Release(1)
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
if err := e.processFile(ctx, f, kbID); err != nil {
|
||||
errCh <- err
|
||||
}
|
||||
}(f)
|
||||
}
|
||||
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(errCh)
|
||||
}()
|
||||
|
||||
return <-errCh // 返回第一个错误(或 nil)
|
||||
}
|
||||
|
||||
func (e *EpubConverter) processFile(ctx context.Context, f *zip.File, kbID string) error {
|
||||
file, err := f.Open()
|
||||
if err != nil {
|
||||
return fmt.Errorf("打开文件 %s 失败: %v", f.Name, err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
ext := strings.ToLower(filepath.Ext(f.Name))
|
||||
ossPath := fmt.Sprintf("%s/%s%s", kbID, uuid.New().String(), ext)
|
||||
|
||||
e.mu.Lock()
|
||||
e.resources[f.Name] = fmt.Sprintf("/%s/%s", domain.Bucket, ossPath)
|
||||
e.mu.Unlock()
|
||||
_, err = e.minioClient.PutObject(
|
||||
ctx,
|
||||
domain.Bucket,
|
||||
ossPath,
|
||||
file,
|
||||
f.FileInfo().Size(),
|
||||
minio.PutObjectOptions{
|
||||
ContentType: e.resourcesIdMap[e.relativePath[f.Name]].MediaType,
|
||||
UserMetadata: map[string]string{"originalname": filepath.Base(f.Name)},
|
||||
},
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
// isSkippableFile reports whether a zip entry should not be uploaded as a
// resource: the two EPUB bookkeeping entries, and any entry whose
// extension is exactly ".html", ".css" or ".xml" (case-sensitive).
func isSkippableFile(name string) bool {
	switch name {
	case "META-INF/container.xml", "mimetype":
		return true
	}
	switch filepath.Ext(name) {
	case ".html", ".css", ".xml":
		return true
	}
	return false
}
|
||||
|
||||
func (e *EpubConverter) exchangeUrl(ctx context.Context, content string) ([]byte, error) {
|
||||
// 将字符串转换为字节切片
|
||||
mdContent := []byte(content)
|
||||
|
||||
// 定义 getUrl 函数,使用资源映射表替换 URL
|
||||
getUrl := func(ctx context.Context, originUrl *string) (string, error) {
|
||||
if originUrl == nil {
|
||||
return "", fmt.Errorf("originUrl is nil")
|
||||
}
|
||||
|
||||
// 查找资源映射
|
||||
if newUrl, exists := e.resources[*originUrl]; exists {
|
||||
return newUrl, nil
|
||||
}
|
||||
|
||||
// 未找到映射,返回原始 URL
|
||||
return *originUrl, nil
|
||||
}
|
||||
|
||||
// 使用 ExchangeMarkDownImageUrl 处理 Markdown
|
||||
processedContent, err := ExchangeMarkDownImageUrl(
|
||||
ctx,
|
||||
mdContent,
|
||||
getUrl,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to exchange URLs: %w", err)
|
||||
}
|
||||
|
||||
return []byte(processedContent), nil
|
||||
}
|
||||
|
||||
// getFullPath returns the full-path attribute of the first <rootfile> in
// META-INF/container.xml, e.g.
//
//	<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
//
// It returns an error when container.xml is missing, cannot be decoded,
// declares no rootfile, or declares one with an empty full-path.
func getFullPath(zipReader *zip.Reader) (string, error) {
	// XML shape of container.xml.
	type Rootfile struct {
		FullPath  string `xml:"full-path,attr"`
		MediaType string `xml:"media-type,attr"`
	}
	type Rootfiles struct {
		Rootfile []Rootfile `xml:"rootfile"`
	}
	type Container struct {
		XMLName   xml.Name  `xml:"container"`
		Xmlns     string    `xml:"xmlns,attr"`
		Version   string    `xml:"version,attr"`
		Rootfiles Rootfiles `xml:"rootfiles"`
	}

	for _, f := range zipReader.File {
		if f.Name != "META-INF/container.xml" {
			continue
		}
		r, err := f.Open()
		if err != nil {
			return "", err
		}
		defer r.Close()

		var c Container
		if err := xml.NewDecoder(r).Decode(&c); err != nil {
			return "", fmt.Errorf("failed to decode container.xml: %w", err)
		}
		// Guard the index: a container without any <rootfile> would
		// otherwise panic on Rootfile[0].
		if len(c.Rootfiles.Rootfile) == 0 || c.Rootfiles.Rootfile[0].FullPath == "" {
			return "", errors.New("full-path not found in container.xml")
		}
		return c.Rootfiles.Rootfile[0].FullPath, nil
	}
	return "", errors.New("container.xml not found")
}
|
||||
|
||||
// valid checks the archive's "mimetype" entry: its content must be exactly
// "application/epub+zip". An archive without a mimetype entry is accepted.
func valid(zipReader *zip.Reader) error {
	for _, entry := range zipReader.File {
		if entry.Name != "mimetype" {
			continue
		}
		rc, err := entry.Open()
		if err != nil {
			return err
		}
		defer rc.Close()

		var content bytes.Buffer
		if _, err := content.ReadFrom(rc); err != nil {
			return fmt.Errorf("failed to read mimetype: %w", err)
		}
		if content.String() != "application/epub+zip" {
			return errors.New("invalid mimetype")
		}
	}
	return nil
}
|
||||
|
||||
// Package represents the root element of the OPF file.
type Package struct {
	XMLName  xml.Name `xml:"package"`
	Spine    Spine    `xml:"spine"` // reading order
	Guide    Guide    `xml:"guide"` // structural references such as the cover
	Manifest struct { // resource list
		Items []Item `xml:"item"`
	} `xml:"manifest"`
	Metadata struct { // publication metadata
		// Title is the text of <dc:title>. encoding/xml matches elements
		// by local name, so the tag must be "title": a tag of "dc:title"
		// never matches and leaves the field empty.
		Title string `xml:"title"`
	} `xml:"metadata"`
}

// Spine represents the spine section of the OPF file.
type Spine struct {
	Toc      string    `xml:"toc,attr"`
	ItemRefs []ItemRef `xml:"itemref"`
}

// ItemRef represents an itemref in the spine section; IDRef is the ID of
// a manifest item.
type ItemRef struct {
	IDRef string `xml:"idref,attr"`
}

// Guide represents the guide section of the OPF file.
type Guide struct {
	References []Reference `xml:"reference"`
}

// Reference represents a reference in the guide section.
type Reference struct {
	Href  string `xml:"href,attr"`
	Title string `xml:"title,attr"`
	Type  string `xml:"type,attr"`
}

// Item represents an item in the manifest section.
type Item struct {
	ID        string `xml:"id,attr"`
	Href      string `xml:"href,attr"`
	MediaType string `xml:"media-type,attr"`
}
|
||||
|
||||
func getOpf(zipReader *zip.Reader) (*Package, error) {
|
||||
// read ./META_INF/container.xml
|
||||
opfPath, err := getFullPath(zipReader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// read ./OEBPS/content.opf
|
||||
for _, f := range zipReader.File {
|
||||
if f.Name == opfPath {
|
||||
r, err := f.Open()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer r.Close()
|
||||
var p Package
|
||||
de := xml.NewDecoder(r)
|
||||
if err := de.Decode(&p); err != nil {
|
||||
return nil, fmt.Errorf("解码OPF文件失败: %v", err)
|
||||
}
|
||||
return &p, nil
|
||||
}
|
||||
}
|
||||
return nil, errors.New("content.opf not found")
|
||||
}
|
||||
|
||||
// NCX is the root element of an EPUB NCX navigation document.
type NCX struct {
	XMLName xml.Name `xml:"ncx"`
	NavMap  NavMap   `xml:"navMap"`
}

// NavMap holds the top-level navigation points.
type NavMap struct {
	NavPoints []NavPoint `xml:"navPoint"`
}

// NavPoint is one table-of-contents entry.
type NavPoint struct {
	ID        string   `xml:"id,attr"`
	PlayOrder string   `xml:"playOrder,attr"`
	NavLabel  NavLabel `xml:"navLabel"`
	Content   Content  `xml:"content"`
}

// NavLabel carries the display text of a NavPoint.
type NavLabel struct {
	Text string `xml:"text"`
}

// Content carries the target of a NavPoint.
type Content struct {
	Src string `xml:"src,attr"`
}

// ParseNCX decodes an NCX document from r and returns one map per
// top-level navPoint with the keys "id", "playOrder", "title" and "src".
// Nested navPoints are not included. The result is nil when the document
// has no top-level navPoints.
func ParseNCX(r io.Reader) ([]map[string]string, error) {
	doc := NCX{}
	dec := xml.NewDecoder(r)
	if err := dec.Decode(&doc); err != nil {
		return nil, fmt.Errorf("解析NCX失败: %v", err)
	}

	var toc []map[string]string
	points := doc.NavMap.NavPoints
	for i := range points {
		toc = append(toc, map[string]string{
			"id":        points[i].ID,
			"playOrder": points[i].PlayOrder,
			"title":     points[i].NavLabel.Text,
			"src":       points[i].Content.Src,
		})
	}
	return toc, nil
}
|
||||
239
backend/utils/feed.go
Normal file
239
backend/utils/feed.go
Normal file
@@ -0,0 +1,239 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// FeedItem is a single entry of a feed, independent of the source format.
type FeedItem struct {
	// Title is the entry title.
	Title string
	// Link is the entry URL.
	Link string
	// Description is the entry's descriptive text.
	Description string
	// Published is the publication time as a string, in whatever format
	// the feed source used.
	Published string
}
|
||||
|
||||
// Feed represents a generic feed structure
|
||||
type Feed struct {
|
||||
Title string
|
||||
Description string
|
||||
Link string
|
||||
Items []FeedItem
|
||||
}
|
||||
|
||||
// cleanXMLContent drops every rune that is not a legal XML 1.0 character
// (see https://www.w3.org/TR/xml/#charsets) and returns the rest unchanged.
func cleanXMLContent(content string) string {
	// isXMLChar implements the Char production of the XML 1.0 spec.
	isXMLChar := func(r rune) bool {
		switch {
		case r == 0x9, r == 0xA, r == 0xD:
			return true
		case r >= 0x20 && r <= 0xD7FF:
			return true
		case r >= 0xE000 && r <= 0xFFFD:
			return true
		case r >= 0x10000 && r <= 0x10FFFF:
			return true
		}
		return false
	}

	return strings.Map(func(r rune) rune {
		if !isXMLChar(r) {
			return -1 // a negative result makes strings.Map drop the rune
		}
		return r
	}, content)
}
|
||||
|
||||
// ParseFeed 解析指定URL的Feed内容,返回通用Feed结构
|
||||
// 参数:
|
||||
// url: 要解析的Feed内容URL
|
||||
// 返回值:
|
||||
// *Feed: 解析后的通用Feed结构(包含标题、描述、链接和条目列表)
|
||||
// error: 解析过程中出现的错误(网络错误、格式不支持等)
|
||||
func ParseFeed(url string) (*Feed, error) {
|
||||
// Get feed content
|
||||
content, err := HTTPGet(url)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get feed content: %v", err)
|
||||
}
|
||||
|
||||
// Decode content
|
||||
decoded := DecodeBytes(content)
|
||||
// Clean illegal XML characters
|
||||
cleaned := cleanXMLContent(decoded)
|
||||
decodedBytes := []byte(cleaned)
|
||||
|
||||
// Try to detect feed format and parse accordingly
|
||||
if strings.Contains(cleaned, "<rss") {
|
||||
return parseRSS(decodedBytes)
|
||||
} else if strings.Contains(cleaned, "<feed") {
|
||||
return parseAtom(decodedBytes)
|
||||
} else if strings.Contains(cleaned, "\"version\":") {
|
||||
return parseJSONFeed(decodedBytes)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unsupported feed format")
|
||||
}
|
||||
|
||||
// parseRSS 解析RSS格式(如RSS 2.0)的内容
|
||||
// 参数:content - RSS格式的字节内容
|
||||
// 返回值:解析后的通用Feed结构或错误
|
||||
// 注意:处理链接时按以下优先级获取:link标签的href属性 > link标签文本值 > Atom扩展链接 > Guid(永久链接)
|
||||
func parseRSS(content []byte) (*Feed, error) {
|
||||
type RSSFeed struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Channel struct {
|
||||
Title string `xml:"title"`
|
||||
Description string `xml:"description"`
|
||||
Link string `xml:"link"`
|
||||
AtomLink struct {
|
||||
Href string `xml:"href,attr"`
|
||||
} `xml:"http://www.w3.org/2005/Atom link"`
|
||||
Items []struct {
|
||||
Title string `xml:"title"`
|
||||
Links []struct {
|
||||
Href string `xml:"href,attr"`
|
||||
Value string `xml:",chardata"`
|
||||
} `xml:"link"`
|
||||
Description string `xml:"description"`
|
||||
PubDate string `xml:"pubDate"`
|
||||
Guid struct {
|
||||
IsPermaLink string `xml:"isPermaLink,attr"`
|
||||
Value string `xml:",chardata"`
|
||||
} `xml:"guid"`
|
||||
AtomLink struct {
|
||||
Href string `xml:"href,attr"`
|
||||
} `xml:"http://www.w3.org/2005/Atom link"`
|
||||
} `xml:"item"`
|
||||
} `xml:"channel"`
|
||||
}
|
||||
|
||||
var rssFeed RSSFeed
|
||||
if err := xml.Unmarshal(content, &rssFeed); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse RSS: %v", err)
|
||||
}
|
||||
|
||||
feed := &Feed{
|
||||
Title: rssFeed.Channel.Title,
|
||||
Description: rssFeed.Channel.Description,
|
||||
Link: rssFeed.Channel.Link,
|
||||
Items: make([]FeedItem, 0),
|
||||
}
|
||||
|
||||
for _, item := range rssFeed.Channel.Items {
|
||||
feedItem := FeedItem{
|
||||
Title: item.Title,
|
||||
Description: item.Description,
|
||||
Published: item.PubDate,
|
||||
}
|
||||
|
||||
// Try to get link from various sources in order of preference
|
||||
if len(item.Links) > 0 {
|
||||
// Try href attribute first, then value
|
||||
if item.Links[0].Href != "" {
|
||||
feedItem.Link = item.Links[0].Href
|
||||
} else if item.Links[0].Value != "" {
|
||||
feedItem.Link = item.Links[0].Value
|
||||
}
|
||||
} else if item.AtomLink.Href != "" {
|
||||
feedItem.Link = item.AtomLink.Href
|
||||
} else if item.Guid.Value != "" && (item.Guid.IsPermaLink == "" || item.Guid.IsPermaLink == "true") {
|
||||
feedItem.Link = item.Guid.Value
|
||||
}
|
||||
|
||||
feed.Items = append(feed.Items, feedItem)
|
||||
}
|
||||
|
||||
return feed, nil
|
||||
}
|
||||
|
||||
// parseAtom 解析Atom 1.0格式的内容
|
||||
// 参数:content - Atom格式的字节内容
|
||||
// 返回值:解析后的通用Feed结构或错误
|
||||
// 注意:Feed链接取第一个link元素的href属性(建议优先使用rel="alternate"的链接)
|
||||
func parseAtom(content []byte) (*Feed, error) {
|
||||
type AtomFeed struct {
|
||||
XMLName xml.Name `xml:"feed"`
|
||||
Title string `xml:"title"`
|
||||
Subtitle string `xml:"subtitle"`
|
||||
Link []struct {
|
||||
Href string `xml:"href,attr"`
|
||||
} `xml:"link"`
|
||||
Entries []struct {
|
||||
Title string `xml:"title"`
|
||||
Link []struct {
|
||||
Href string `xml:"href,attr"`
|
||||
} `xml:"link"`
|
||||
Summary string `xml:"summary"`
|
||||
Updated string `xml:"updated"`
|
||||
} `xml:"entry"`
|
||||
}
|
||||
|
||||
var atomFeed AtomFeed
|
||||
if err := xml.Unmarshal(content, &atomFeed); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse Atom: %v", err)
|
||||
}
|
||||
|
||||
feed := &Feed{
|
||||
Title: atomFeed.Title,
|
||||
Description: atomFeed.Subtitle,
|
||||
Items: make([]FeedItem, 0),
|
||||
}
|
||||
|
||||
if len(atomFeed.Link) > 0 {
|
||||
feed.Link = atomFeed.Link[0].Href
|
||||
}
|
||||
|
||||
for _, entry := range atomFeed.Entries {
|
||||
item := FeedItem{
|
||||
Title: entry.Title,
|
||||
Description: entry.Summary,
|
||||
Published: entry.Updated,
|
||||
}
|
||||
if len(entry.Link) > 0 {
|
||||
item.Link = entry.Link[0].Href
|
||||
}
|
||||
feed.Items = append(feed.Items, item)
|
||||
}
|
||||
|
||||
return feed, nil
|
||||
}
|
||||
|
||||
// parseJSONFeed 解析JSON Feed格式(如1.1版本)的内容
|
||||
// 参数:content - JSON Feed格式的字节内容
|
||||
// 返回值:解析后的通用Feed结构或错误
|
||||
// 字段映射:home_page_url -> Feed.Link; date_published -> FeedItem.Published
|
||||
func parseJSONFeed(content []byte) (*Feed, error) {
|
||||
type JSONFeed struct {
|
||||
Version string `json:"version"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
HomePageURL string `json:"home_page_url"`
|
||||
Items []struct {
|
||||
Title string `json:"title"`
|
||||
URL string `json:"url"`
|
||||
ContentText string `json:"content_text"`
|
||||
DatePublished string `json:"date_published"`
|
||||
} `json:"items"`
|
||||
}
|
||||
|
||||
var jsonFeed JSONFeed
|
||||
if err := json.Unmarshal(content, &jsonFeed); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse JSON Feed: %v", err)
|
||||
}
|
||||
|
||||
feed := &Feed{
|
||||
Title: jsonFeed.Title,
|
||||
Description: jsonFeed.Description,
|
||||
Link: jsonFeed.HomePageURL,
|
||||
Items: make([]FeedItem, 0),
|
||||
}
|
||||
|
||||
for _, item := range jsonFeed.Items {
|
||||
feed.Items = append(feed.Items, FeedItem{
|
||||
Title: item.Title,
|
||||
Link: item.URL,
|
||||
Description: item.ContentText,
|
||||
Published: item.DatePublished,
|
||||
})
|
||||
}
|
||||
|
||||
return feed, nil
|
||||
}
|
||||
16
backend/utils/file.go
Normal file
16
backend/utils/file.go
Normal file
@@ -0,0 +1,16 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// IsImageFile reports whether filename has one of the supported image
// extensions (.jpg, .jpeg, .png, .webp), compared case-insensitively.
func IsImageFile(filename string) bool {
	supported := [...]string{".jpg", ".jpeg", ".png", ".webp"}
	extension := filepath.Ext(filename)
	return slices.Contains(supported[:], strings.ToLower(extension))
}
|
||||
188
backend/utils/ip_addr.go
Normal file
188
backend/utils/ip_addr.go
Normal file
@@ -0,0 +1,188 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/netip"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
)
|
||||
|
||||
// documentationPrefixes lists the address blocks reserved for
// documentation and examples.
var documentationPrefixes = []netip.Prefix{
	// IPv4
	netip.MustParsePrefix("192.0.2.0/24"),    // TEST-NET-1
	netip.MustParsePrefix("198.51.100.0/24"), // TEST-NET-2
	netip.MustParsePrefix("203.0.113.0/24"),  // TEST-NET-3
	// IPv6
	netip.MustParsePrefix("2001:db8::/32"), // IPv6 documentation prefix
}
|
||||
|
||||
func GetClientIPFromRemoteAddr(c echo.Context) string {
|
||||
return ExtractHostFromRemoteAddr(c.Request())
|
||||
}
|
||||
|
||||
// ExtractHostFromRemoteAddr returns the host part of r.RemoteAddr.
//
// An empty RemoteAddr yields "". When the value cannot be split as
// "host:port" it is returned whole, with surrounding whitespace trimmed.
func ExtractHostFromRemoteAddr(r *http.Request) string {
	remote := r.RemoteAddr
	if remote == "" {
		return ""
	}
	if host, _, err := net.SplitHostPort(remote); err == nil {
		return host
	}
	return strings.TrimSpace(remote)
}
|
||||
|
||||
// IsPrivateOrReservedIP checks if the given IP address is private or reserved
|
||||
func IsPrivateOrReservedIP(ipStr string) bool {
|
||||
ip := net.ParseIP(ipStr)
|
||||
if ip == nil {
|
||||
return false // Invalid IP address
|
||||
}
|
||||
|
||||
// Private IP ranges:
|
||||
// IPv4:
|
||||
// 10.0.0.0/8
|
||||
// 172.16.0.0/12
|
||||
// 192.168.0.0/16
|
||||
// IPv6:
|
||||
// fc00::/7 (Unique Local Addresses)
|
||||
if ip.IsPrivate() {
|
||||
return true
|
||||
}
|
||||
|
||||
// Loopback addresses:
|
||||
// IPv4: 127.0.0.0/8
|
||||
// IPv6: ::1/128
|
||||
if ip.IsLoopback() {
|
||||
return true
|
||||
}
|
||||
|
||||
// Link-local addresses:
|
||||
// IPv4: 169.254.0.0/16
|
||||
// IPv6: fe80::/10
|
||||
if ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast() {
|
||||
return true
|
||||
}
|
||||
|
||||
// Documentation addresses:
|
||||
// IPv4:
|
||||
// 192.0.2.0/24 (TEST-NET-1)
|
||||
// 198.51.100.0/24 (TEST-NET-2)
|
||||
// 203.0.113.0/24 (TEST-NET-3)
|
||||
// IPv6:
|
||||
// 2001:db8::/32
|
||||
if isDocumentationIP(ip) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Other reserved ranges
|
||||
return isOtherReservedIP(ip)
|
||||
}
|
||||
|
||||
func isDocumentationIP(ip net.IP) bool {
|
||||
addr, ok := netip.AddrFromSlice(ip)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
// 统一处理映射地址,确保比对逻辑一致
|
||||
addr = addr.Unmap()
|
||||
|
||||
for _, prefix := range documentationPrefixes {
|
||||
if prefix.Contains(addr) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// otherReservedPrefixes lists reserved ranges that the net.IP helpers
// (IsPrivate, IsLoopback, IsLinkLocal*) do not cover.
var otherReservedPrefixes = []netip.Prefix{
	// IPv4
	netip.MustParsePrefix("0.0.0.0/8"),      // current network (RFC 1122)
	netip.MustParsePrefix("100.64.0.0/10"),  // shared address space (RFC 6598)
	netip.MustParsePrefix("192.0.0.0/24"),   // IETF protocol assignments (RFC 6890)
	netip.MustParsePrefix("192.88.99.0/24"), // 6to4 relay anycast (RFC 3068)
	netip.MustParsePrefix("198.18.0.0/15"),  // network benchmark tests (RFC 2544)
	netip.MustParsePrefix("224.0.0.0/4"),    // multicast
	netip.MustParsePrefix("240.0.0.0/4"),    // reserved (RFC 1112), includes broadcast
	// IPv6
	netip.MustParsePrefix("::/128"),       // unspecified address
	netip.MustParsePrefix("64:ff9b::/96"), // IPv4-IPv6 translation (RFC 6052)
	netip.MustParsePrefix("100::/64"),     // discard prefix (RFC 6666)
	netip.MustParsePrefix("2001::/23"),    // IETF protocol assignments; contains 2001:2::/48 (RFC 5180)
	netip.MustParsePrefix("2002::/16"),    // 6to4 (RFC 3056)
	netip.MustParsePrefix("ff00::/8"),     // multicast
}

// isOtherReservedIP reports whether ip lies in one of the
// otherReservedPrefixes.
//
// Every range is matched as a whole prefix. The earlier code compared
// 64:ff9b::/96 and 100::/64 against a single address only, so for example
// the NAT64 form of 127.0.0.1 (64:ff9b::7f00:1) was not recognized. IPv4
// multicast is included as well, to match the IPv6 multicast check.
//
// IPv4-mapped IPv6 addresses are unmapped and checked against the IPv4
// prefixes. An ip that is neither 4 nor 16 bytes long yields false.
func isOtherReservedIP(ip net.IP) bool {
	addr, ok := netip.AddrFromSlice(ip)
	if !ok {
		return false
	}
	addr = addr.Unmap()

	for _, prefix := range otherReservedPrefixes {
		if prefix.Contains(addr) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// IsIPv6 reports whether ipStr parses as an IP address that has no IPv4
// form. Invalid input and IPv4-mapped IPv6 addresses yield false.
func IsIPv6(ipStr string) bool {
	parsed := net.ParseIP(ipStr)
	if parsed == nil {
		return false
	}
	return parsed.To4() == nil
}
|
||||
|
||||
// ValidateURLForSSRF validates a URL to prevent SSRF attacks
|
||||
// It checks:
|
||||
// - URL format is valid
|
||||
// - Scheme is http or https only
|
||||
// - No credentials in URL
|
||||
// - Hostname resolves to public IP addresses only (blocks private/reserved IPs)
|
||||
func ValidateURLForSSRF(urlStr string) error {
|
||||
// Parse and validate URL
|
||||
parsedURL, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid URL format: %w", err)
|
||||
}
|
||||
|
||||
// Validate URL scheme (only http/https allowed)
|
||||
if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
|
||||
return fmt.Errorf("invalid URL scheme: only http and https are allowed")
|
||||
}
|
||||
|
||||
// Block URLs with userinfo (credentials)
|
||||
if parsedURL.User != nil {
|
||||
return fmt.Errorf("URLs with credentials are not allowed")
|
||||
}
|
||||
|
||||
// Resolve hostname to IP and check if it's private/reserved
|
||||
hostname := parsedURL.Hostname()
|
||||
if hostname == "" {
|
||||
return fmt.Errorf("invalid URL: missing hostname")
|
||||
}
|
||||
|
||||
// Resolve the hostname to IP addresses
|
||||
ips, err := net.LookupIP(hostname)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to resolve hostname: %w", err)
|
||||
}
|
||||
|
||||
// Check if any resolved IP is private or reserved
|
||||
for _, ip := range ips {
|
||||
if IsPrivateOrReservedIP(ip.String()) {
|
||||
return fmt.Errorf("access to private/reserved IP addresses is not allowed")
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
75
backend/utils/processor.go
Normal file
75
backend/utils/processor.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"io"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Node is one vertex of a ProcessorTree: a byte buffer plus an ordered list
// of child nodes.
type Node struct {
	buf *bytes.Buffer
	son []*Node
}

// newNode returns a Node with an empty buffer and no children.
func newNode() *Node {
	return &Node{son: []*Node{}, buf: bytes.NewBufferString("")}
}

// ProcessorTree collects byte fragments in a tree of Nodes and can flatten
// them into a single byte slice. All methods serialize on mu.
type ProcessorTree struct {
	mu     *sync.Mutex
	root   *Node
	result *bytes.Buffer
}

// NewProcessorTree returns a tree that contains only an empty root node.
func NewProcessorTree() *ProcessorTree {
	return &ProcessorTree{
		root:   newNode(),
		mu:     &sync.Mutex{},
		result: bytes.NewBufferString(""),
	}
}

// GetNode creates a new empty node, appends it as the last child of farther
// and returns it. It returns an error when farther is nil.
func (t *ProcessorTree) GetNode(farther *Node) (*Node, error) {
	if farther == nil {
		return nil, errors.New("father is nil")
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	temp := newNode()
	farther.son = append(farther.son, temp)
	return temp, nil
}

// Add appends data to node's buffer. It returns an error when node is nil.
func (t *ProcessorTree) Add(node *Node, data []byte) error {
	if node == nil {
		return errors.New("node is nil")
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	node.buf.Write(data)
	return nil
}

// GetResult flattens the tree in pre-order (a node's own bytes first, then
// each child in insertion order) and returns the concatenated bytes.
//
// The result is rebuilt from scratch on every call and the node buffers are
// left untouched, so data added after an earlier call still appears at its
// tree position. The walk holds the tree mutex so it cannot race with
// concurrent Add or GetNode calls.
func (t *ProcessorTree) GetResult() ([]byte, error) {
	t.mu.Lock()
	defer t.mu.Unlock()
	// A fresh buffer keeps slices returned by earlier calls from being
	// overwritten and prevents output from accumulating across calls.
	t.result = bytes.NewBufferString("")
	if err := t.getRes(t.root); err != nil {
		return nil, err
	}
	return t.result.Bytes(), nil
}

// getRes appends node's bytes and then, recursively, those of its children
// to t.result. A nil node is a no-op. The caller must hold t.mu.
func (t *ProcessorTree) getRes(node *Node) error {
	if node == nil {
		return nil
	}
	// Copy from a reader over the buffer's bytes rather than from the buffer
	// itself: reading a bytes.Buffer directly would consume its contents.
	if _, err := io.Copy(t.result, bytes.NewReader(node.buf.Bytes())); err != nil {
		return err
	}
	for _, son := range node.son {
		if err := t.getRes(son); err != nil {
			return err
		}
	}
	return nil
}
|
||||
7
backend/utils/time.go
Normal file
7
backend/utils/time.go
Normal file
@@ -0,0 +1,7 @@
|
||||
package utils
|
||||
|
||||
import "time"
|
||||
|
||||
// GetTimeHourOffset returns the current time truncated to the hour, shifted
// by the given number of hours (negative values move into the past).
func GetTimeHourOffset(hours int64) time.Time {
	startOfHour := time.Now().Truncate(time.Hour)
	shift := time.Duration(hours) * time.Hour
	return startOfHour.Add(shift)
}
|
||||
366
backend/utils/utils.go
Normal file
366
backend/utils/utils.go
Normal file
@@ -0,0 +1,366 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
|
||||
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
|
||||
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
|
||||
"github.com/google/uuid"
|
||||
"github.com/minio/minio-go/v7"
|
||||
tiktoken_loader "github.com/pkoukk/tiktoken-go-loader"
|
||||
"github.com/yuin/goldmark"
|
||||
"github.com/yuin/goldmark/ast"
|
||||
"github.com/yuin/goldmark/renderer/html"
|
||||
"github.com/yuin/goldmark/text"
|
||||
|
||||
"github.com/chaitin/panda-wiki/domain"
|
||||
"github.com/chaitin/panda-wiki/store/s3"
|
||||
)
|
||||
|
||||
// HTTPGet performs an HTTP GET on url with a 10 second timeout and returns
// the response body.
//
// It returns an error when the request fails (the underlying error is
// wrapped and can be inspected with errors.Is / errors.As), when the status
// code is not 200 OK, or when the body cannot be read.
//
// NOTE(review): TLS certificate verification is disabled
// (InsecureSkipVerify), so the peer's identity is not checked. This is kept
// for compatibility with existing callers; confirm it is intentional.
func HTTPGet(url string) ([]byte, error) {
	client := &http.Client{
		Timeout: 10 * time.Second,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{
				InsecureSkipVerify: true,
			},
		},
	}

	resp, err := client.Get(url)
	if err != nil {
		// %w keeps the original error in the chain for callers.
		return nil, fmt.Errorf("failed to get %s: %w", url, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	return io.ReadAll(resp.Body)
}
|
||||
|
||||
// DecodeBytes decode bytes
|
||||
func DecodeBytes(data []byte) string {
|
||||
// try different encodings
|
||||
encodings := []string{"utf-8", "gbk", "gb2312", "big5"}
|
||||
for _, enc := range encodings {
|
||||
if decoded, err := decode(data, enc); err == nil {
|
||||
return decoded
|
||||
}
|
||||
}
|
||||
return string(data)
|
||||
}
|
||||
|
||||
// IsURLValid reports whether urlStr parses as a URL that has both a scheme
// and a host.
func IsURLValid(urlStr string) bool {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	if parsed.Scheme == "" || parsed.Host == "" {
		return false
	}
	return true
}
|
||||
|
||||
// URLNormalize returns a canonical form of urlStr:
//   - the fragment is removed
//   - a non-empty path is cleaned with path.Clean ("." and ".." segments
//     and duplicate slashes are resolved, a trailing slash is dropped)
//   - the default port is removed (80 for http, 443 for https)
//
// If urlStr cannot be parsed it is returned unchanged.
func URLNormalize(urlStr string) string {
	u, err := url.Parse(urlStr)
	if err != nil {
		return urlStr
	}

	// remove url fragment
	u.Fragment = ""

	// normalize path; an empty path is left alone because path.Clean("")
	// returns ".", which would turn "http://host" into "http://host/.".
	if u.Path != "" {
		u.Path = path.Clean(u.Path)
	}

	// remove default port; trim the ":port" suffix from Host rather than
	// using Hostname(), which strips the brackets an IPv6 literal needs.
	port := u.Port()
	if (u.Scheme == "http" && port == "80") || (u.Scheme == "https" && port == "443") {
		u.Host = strings.TrimSuffix(u.Host, ":"+port)
	}

	return u.String()
}
|
||||
|
||||
// URLRemovePath strips the path, query string and fragment from rawURL and
// returns what remains (scheme, userinfo and host). It returns an error when
// rawURL cannot be parsed.
func URLRemovePath(rawURL string) (string, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}

	// Blank out everything after the authority component.
	u.Path, u.RawPath = "", ""
	u.RawQuery, u.Fragment = "", ""

	return u.String(), nil
}
|
||||
|
||||
// decode converts data to a string using the named encoding.
//
// It is currently a placeholder: the encoding argument is ignored, the bytes
// are converted directly and the error is always nil. Real conversion still
// has to be implemented, e.g. with the golang.org/x/text/encoding package.
func decode(data []byte, encoding string) (string, error) {
	_ = encoding // not used until real conversion is implemented
	text := string(data)
	return text, nil
}
|
||||
|
||||
// GetHeaderMap parses newline-separated "key=value" lines into a map.
// Each line is split at its first "="; lines without "=" are skipped. Keys
// and values are not trimmed, and a later duplicate key overwrites an
// earlier one.
func GetHeaderMap(header string) map[string]string {
	result := make(map[string]string)
	lines := strings.Split(header, "\n")
	for i := range lines {
		key, value, found := strings.Cut(lines[i], "=")
		if !found {
			continue
		}
		result[key] = value
	}
	return result
}
|
||||
|
||||
// UrlEncode percent-encodes only the non-ASCII runes of s, using
// url.QueryEscape on each of them. Every ASCII character, including "/",
// spaces and reserved URL characters, is copied through unchanged.
func UrlEncode(s string) string {
	var out strings.Builder
	for _, r := range s {
		if r >= 128 {
			out.WriteString(url.QueryEscape(string(r)))
			continue
		}
		// ASCII (which covers '/') passes through verbatim.
		out.WriteRune(r)
	}
	return out.String()
}
|
||||
|
||||
// RemoveFirstDir drops the first slash-separated component of path and
// joins the remaining components with filepath.Join (which also cleans the
// result). A path with no separator is returned unchanged.
func RemoveFirstDir(path string) string {
	// Break the path into its components.
	segments := strings.Split(filepath.ToSlash(path), "/")

	// Nothing to remove when there is only one component.
	if len(segments) <= 1 {
		return path
	}
	return filepath.Join(segments[1:]...)
}
|
||||
|
||||
// RemoveURLParams returns rawURL with its query string removed. All other
// components, including the fragment, are kept. It returns an error when
// rawURL cannot be parsed.
func RemoveURLParams(rawURL string) (string, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}

	// Drop the query string and serialize the rest.
	u.RawQuery = ""
	stripped := u.String()
	return stripped, nil
}
|
||||
|
||||
func UploadImage(ctx context.Context, minioClient *s3.MinioClient, imageURL string, kbID string) (string, error) {
|
||||
if minioClient == nil {
|
||||
return "", fmt.Errorf("minio client is nil")
|
||||
}
|
||||
var data []byte
|
||||
var contentType string
|
||||
if strings.HasPrefix(imageURL, "http://") || strings.HasPrefix(imageURL, "https://") {
|
||||
resp, err := http.Get(imageURL)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to fetch image: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// 检查状态码
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP request failed with status: %s", resp.Status)
|
||||
}
|
||||
|
||||
// 读取图片数据
|
||||
data, err = io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to read image data: %v", err)
|
||||
}
|
||||
|
||||
// 获取 Content-Type
|
||||
contentType = resp.Header.Get("Content-Type")
|
||||
} else {
|
||||
// 从本地文件系统读取图片
|
||||
var err error
|
||||
data, err = os.ReadFile(imageURL)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to read image file: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// 获取图片名称(从 URL 路径中提取)
|
||||
parsedURL, err := url.Parse(imageURL)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to parse URL: %v", err)
|
||||
}
|
||||
_, filename := filepath.Split(parsedURL.Path)
|
||||
// 解码可能的 URL 编码(如中文文件名)
|
||||
decodedName, err := url.PathUnescape(filename)
|
||||
if err != nil {
|
||||
decodedName = filename // 如果解码失败,使用原始名称
|
||||
}
|
||||
|
||||
ext := strings.ToLower(filepath.Ext(decodedName))
|
||||
if ext == "" {
|
||||
contentType = mime.TypeByExtension(ext)
|
||||
}
|
||||
if contentType == "" {
|
||||
contentType = "application/octet-stream"
|
||||
}
|
||||
imgName := fmt.Sprintf("%s/%s%s", kbID, uuid.New().String(), ext)
|
||||
|
||||
if _, err := minioClient.PutObject(
|
||||
ctx,
|
||||
domain.Bucket,
|
||||
imgName,
|
||||
bytes.NewReader(data),
|
||||
int64(len(data)),
|
||||
minio.PutObjectOptions{
|
||||
ContentType: contentType,
|
||||
UserMetadata: map[string]string{
|
||||
"originalname": decodedName,
|
||||
},
|
||||
},
|
||||
); err != nil {
|
||||
return "", fmt.Errorf("failed to upload image to MinIO: %v", err)
|
||||
}
|
||||
return fmt.Sprintf("/%s/%s", domain.Bucket, imgName), nil
|
||||
}
|
||||
|
||||
// GetTitleFromMarkdown derives a title from markdown: the text with
// surrounding whitespace trimmed, cut to at most 60 runes (not bytes).
func GetTitleFromMarkdown(markdown string) string {
	const maxRunes = 60
	trimmed := strings.TrimSpace(markdown)
	if chars := []rune(trimmed); len(chars) > maxRunes {
		return string(chars[:maxRunes])
	}
	return trimmed
}
|
||||
|
||||
func ExchangeMarkDownImageUrl(
|
||||
ctx context.Context,
|
||||
mdContent []byte,
|
||||
getUrl func(ctx context.Context, originUrl *string) (string, error),
|
||||
) (string, error) {
|
||||
md := goldmark.New(
|
||||
goldmark.WithRendererOptions(
|
||||
html.WithHardWraps(),
|
||||
),
|
||||
)
|
||||
reader := text.NewReader(mdContent)
|
||||
doc := md.Parser().Parse(reader)
|
||||
|
||||
// 1. 收集图片节点和原始URL
|
||||
type imgTask struct {
|
||||
node *ast.Image
|
||||
rawUrl string
|
||||
}
|
||||
var tasks []imgTask
|
||||
|
||||
if err := ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
|
||||
if !entering {
|
||||
return ast.WalkContinue, nil
|
||||
}
|
||||
if img, ok := n.(*ast.Image); ok {
|
||||
rawUrl := string(img.Destination)
|
||||
tasks = append(tasks, imgTask{img, rawUrl})
|
||||
}
|
||||
return ast.WalkContinue, nil
|
||||
}); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// 2. 并发获取新URL
|
||||
type result struct {
|
||||
idx int
|
||||
newUrl string
|
||||
err error
|
||||
}
|
||||
|
||||
results := make(chan result, len(tasks))
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for i, t := range tasks {
|
||||
wg.Add(1)
|
||||
go func(idx int, rawUrl string) {
|
||||
defer wg.Done()
|
||||
newUrl, err := getUrl(ctx, &rawUrl)
|
||||
results <- result{idx, newUrl, err}
|
||||
}(i, t.rawUrl)
|
||||
}
|
||||
|
||||
// 关闭结果通道当所有goroutine完成时
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(results)
|
||||
}()
|
||||
|
||||
// 3. 处理结果
|
||||
for res := range results {
|
||||
if res.err != nil {
|
||||
return "", res.err
|
||||
}
|
||||
tasks[res.idx].node.Destination = []byte(res.newUrl)
|
||||
}
|
||||
|
||||
// 4. 渲染Markdown
|
||||
var buf bytes.Buffer
|
||||
if err := md.Renderer().Render(&buf, mdContent, doc); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// 5. 转换并返回字符串
|
||||
conv := converter.NewConverter(
|
||||
converter.WithPlugins(
|
||||
base.NewBasePlugin(),
|
||||
commonmark.NewCommonmarkPlugin(
|
||||
commonmark.WithStrongDelimiter("__"),
|
||||
),
|
||||
),
|
||||
)
|
||||
converted, err := conv.ConvertReader(&buf)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(converted), nil
|
||||
}
|
||||
|
||||
type Localloader struct{}
|
||||
|
||||
func (m *Localloader) LoadTiktokenBpe(_ string) (map[string]int, error) {
|
||||
a := tiktoken_loader.NewOfflineLoader()
|
||||
res, err := a.LoadTiktokenBpe("cl100k_base.tiktoken")
|
||||
return res, err
|
||||
}
|
||||
|
||||
// GetFileNameWithoutExt returns the last element of path with its final
// extension removed, e.g. "/a/b/file.txt" yields "file".
func GetFileNameWithoutExt(path string) string {
	base := filepath.Base(path)
	// filepath.Ext always returns a suffix of its argument, so slicing it
	// off is the same as trimming it.
	ext := filepath.Ext(base)
	return base[:len(base)-len(ext)]
}
|
||||
|
||||
func IsUUID(s string) bool {
|
||||
_, err := uuid.Parse(s)
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// IsLikelyHTML is a cheap heuristic: it reports whether text, once
// surrounding whitespace is trimmed, starts with "<" and ends with ">".
func IsLikelyHTML(text string) bool {
	body := strings.TrimSpace(text)
	if !strings.HasPrefix(body, "<") {
		return false
	}
	return strings.HasSuffix(body, ">")
}
|
||||
Reference in New Issue
Block a user