init push

This commit is contained in:
2026-05-21 19:52:45 +08:00
commit e3f75311ab
1280 changed files with 179173 additions and 0 deletions

182
backend/utils/DFA.go Normal file
View File

@@ -0,0 +1,182 @@
package utils
import (
"errors"
"sync"
)
var (
// dfaInstance maps a knowledge-base ID to the DFA instance registered for it
// by InitDFA. It is nil until InitDFA runs for the first time.
dfaInstance map[string]*DFAInstance
// mu guards dfaInstance: GetDFA takes the read lock, InitDFA the write lock.
mu sync.RWMutex
)
// DFAInstance pairs a DFA with the size of the longest word it was built from.
type DFAInstance struct {
// DFA is the trie built from the word list.
DFA *DFA
// BuffSize is the rune length of the longest word given to InitDFA
// (0 when the word list was empty).
BuffSize int
}
// GetDFA looks up the DFA instance registered for kbID.
// It returns nil when InitDFA has not been called for that ID.
func GetDFA(kbID string) *DFAInstance {
	mu.RLock()
	instance := dfaInstance[kbID]
	mu.RUnlock()
	return instance
}
// InitDFA builds a fresh DFA from words and registers it under kbID,
// replacing whatever instance was stored for that ID before. The recorded
// BuffSize is the rune length of the longest word. The whole build runs while
// the package write lock is held.
// (The original note on this function says it is used by the pro edition.)
func InitDFA(kbID string, words []string) {
	mu.Lock()
	defer mu.Unlock()

	trie := &DFA{Root: NewTrieNode()}
	longest := 0 // stays 0 for an empty word list
	for _, w := range words {
		trie.AddWord(w)
		if n := len([]rune(w)); n > longest {
			longest = n
		}
	}

	// The registry map is created lazily on first use.
	if dfaInstance == nil {
		dfaInstance = make(map[string]*DFAInstance)
	}
	dfaInstance[kbID] = &DFAInstance{DFA: trie, BuffSize: longest}
}
// TrieNode is one node of the DFA trie.
type TrieNode struct {
// Children maps the next rune to the node reached by consuming it.
Children map[rune]*TrieNode
// IsEnd is true when the path from the root to this node spells a stored word.
IsEnd bool
}
// NewTrieNode returns an empty trie node: no children, not a word end.
func NewTrieNode() *TrieNode {
	node := new(TrieNode)
	node.Children = make(map[rune]*TrieNode)
	return node
}
// DFA is a rune trie of sensitive words.
type DFA struct {
// Root is the trie's entry node; every stored word is a path starting here.
Root *TrieNode
}
// AddWord inserts word into the trie rune by rune, creating missing nodes on
// the way, and marks the final node as a word end. Adding the empty string
// marks the root itself.
func (d *DFA) AddWord(word string) {
	cur := d.Root
	for _, r := range word {
		next, ok := cur.Children[r]
		if !ok {
			next = NewTrieNode()
			cur.Children[r] = next
		}
		cur = next
	}
	cur.IsEnd = true
}
// UpdateOldWord replaces oldWord with newWord: it deletes oldWord (a no-op
// when it is not stored) and then adds newWord.
func (d *DFA) UpdateOldWord(oldWord, newWord string) {
	_ = d.DeleteWord(oldWord) // the result is not needed here
	d.AddWord(newWord)
}
// DeleteWord removes word from the trie and prunes nodes that no longer lead
// to any stored word.
//
// It returns true when word was stored and has been removed, and false when
// it was not present. (Previously the result was the internal "may the root
// be pruned" flag, which was true only when the trie became empty.)
func (d *DFA) DeleteWord(word string) bool {
	runes := []rune(word)
	removed := false

	// prune walks down the path for runes and reports whether node has become
	// useless (no children, not a word end) so the caller can unlink it.
	var prune func(node *TrieNode, depth int) bool
	prune = func(node *TrieNode, depth int) bool {
		if depth == len(runes) {
			// The word is not stored: nothing to do.
			if !node.IsEnd {
				return false
			}
			// Clear the end marker; the node can go if nothing hangs below it.
			node.IsEnd = false
			removed = true
			return len(node.Children) == 0
		}

		r := runes[depth]
		child, ok := node.Children[r]
		if !ok {
			return false // path does not exist
		}
		if !prune(child, depth+1) {
			return false
		}

		// The child became useless: unlink it, then report whether this node
		// is now useless too (no other children and not itself a word end).
		delete(node.Children, r)
		return len(node.Children) == 0 && !node.IsEnd
	}

	prune(d.Root, 0)
	return removed
}
// DeleteWordBatch removes every word in words from the trie.
//
// The deletions run one after another on purpose: DeleteWord mutates the
// trie's plain Go maps, and writing to a map from several goroutines without
// synchronisation is a data race that the runtime may abort with
// "concurrent map writes".
func (d *DFA) DeleteWordBatch(words []string) {
	for _, word := range words {
		d.DeleteWord(word)
	}
}
// Filter returns text with every matched sensitive word masked: each rune of
// a match is replaced by '🚫'.
//
// Every position is tried as a match start. While walking the trie from a
// start, each time a word end is reached the runes from the start up to the
// current position are masked, and the walk continues so longer words sharing
// the prefix are masked as well. Masking happens in place, so later start
// positions see the already-masked runes.
func (d *DFA) Filter(text string) string {
	runes := []rune(text)
	for start := range runes {
		cur := d.Root
		for end := start; end < len(runes); end++ {
			next, ok := cur.Children[runes[end]]
			if !ok {
				break // no stored word continues with this rune
			}
			cur = next
			if !cur.IsEnd {
				continue
			}
			// A stored word ends here: mask runes[start..end].
			for k := start; k <= end; k++ {
				runes[k] = '🚫'
			}
		}
	}
	return string(runes)
}
// Check reports whether text contains a stored sensitive word.
//
// Every rune position is tried as a match start. On the first match (the
// shortest word at the earliest start) it returns an error whose message
// names the matched word; it returns nil when nothing matches.
func (d *DFA) Check(text string) error {
	runes := []rune(text)
	for start := range runes {
		node := d.Root
		for end := start; end < len(runes); end++ {
			next, ok := node.Children[runes[end]]
			if !ok {
				break
			}
			node = next
			if node.IsEnd {
				return errors.New("包含敏感词: " + string(runes[start:end+1]))
			}
		}
	}
	return nil
}

430
backend/utils/epub.go Normal file
View File

@@ -0,0 +1,430 @@
package utils
import (
"archive/zip"
"bytes"
"context"
"encoding/xml"
"errors"
"fmt"
"io"
"mime/multipart"
"path/filepath"
"strings"
"sync"
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
"github.com/chaitin/panda-wiki/domain"
"github.com/chaitin/panda-wiki/log"
"github.com/chaitin/panda-wiki/store/s3"
"github.com/google/uuid"
"github.com/minio/minio-go/v7"
"golang.org/x/sync/semaphore"
)
// EpubConverter converts an uploaded EPUB archive into Markdown and uploads
// the archive's resource files through minioClient. Its maps are filled while
// Convert runs; nothing in the code shown here resets them between calls.
type EpubConverter struct {
logger *log.Logger
// mu serialises writes to resources made by concurrent processFile calls.
mu sync.Mutex
minioClient *s3.MinioClient
// zip entry name -> uploaded object path ("/<bucket>/<kbID>/<uuid><ext>")
resources map[string]string
// manifest item id -> manifest item
resourcesIdMap map[string]Item
// manifest item href -> manifest item id
relativePath map[string]string
}
// NewEpubConverter returns a converter that logs under the "epubConverter"
// module, uploads through minio, and starts with empty lookup maps.
func NewEpubConverter(logger *log.Logger, minio *s3.MinioClient) *EpubConverter {
	conv := &EpubConverter{
		logger:      logger.WithModule("epubConverter"),
		minioClient: minio,
	}
	conv.resources = make(map[string]string)
	conv.resourcesIdMap = make(map[string]Item)
	conv.relativePath = make(map[string]string)
	return conv
}
// Convert turns an uploaded EPUB file into a single Markdown document.
//
// It validates the archive, reads the OPF package file, uploads the resource
// files, converts every archive entry to Markdown, and assembles the output:
// the guide references first, then a table of contents built from the NCX
// file, then the spine items in reading order, each preceded by a
// <span id=...> anchor. Finally image URLs are rewritten via exchangeUrl.
//
// It returns the book title from the OPF metadata, the Markdown bytes, and
// the first error encountered.
func (e *EpubConverter) Convert(ctx context.Context, kbID string, data *multipart.FileHeader) (string, []byte, error) {
	reader, err := data.Open()
	if err != nil {
		return "", nil, err
	}
	defer reader.Close()

	zipReader, err := zip.NewReader(reader, data.Size)
	if err != nil {
		return "", nil, err
	}
	if err := valid(zipReader); err != nil {
		return "", nil, err
	}

	// read ./path/to/content.opf
	p, err := getOpf(zipReader)
	if err != nil {
		return "", nil, err
	}
	for _, item := range p.Manifest.Items {
		e.resourcesIdMap[item.ID] = item
		e.relativePath[item.Href] = item.ID
	}

	// resolve resource file
	if err := e.uploadFile(ctx, kbID, zipReader); err != nil {
		return "", nil, err
	}

	conv := converter.NewConverter(
		converter.WithPlugins(
			base.NewBasePlugin(),
			commonmark.NewCommonmarkPlugin(
				commonmark.WithStrongDelimiter("__"),
			),
		),
	)
	conv.Register.TagType("a", converter.TagTypeRemove, converter.PriorityStandard)

	// readEntry reads one archive entry completely and closes it before
	// returning, so open entries do not pile up until Convert returns.
	readEntry := func(f *zip.File) ([]byte, error) {
		rc, err := f.Open()
		if err != nil {
			return nil, err
		}
		defer rc.Close()
		return io.ReadAll(rc)
	}
	// parseTOC parses one .ncx entry, closing it before returning.
	parseTOC := func(f *zip.File) ([]map[string]string, error) {
		rc, err := f.Open()
		if err != nil {
			return nil, err
		}
		defer rc.Close()
		return ParseNCX(rc)
	}

	res := make(map[string]*bytes.Buffer)
	var toc []map[string]string
	for _, zipfile := range zipReader.File {
		if strings.ToLower(filepath.Ext(zipfile.Name)) == ".ncx" {
			if toc, err = parseTOC(zipfile); err != nil {
				return "", nil, err
			}
		}
		// Every entry (the .ncx included) is run through the converter and
		// stored under its base file name.
		htmlStr, err := readEntry(zipfile)
		if err != nil {
			return "", nil, err
		}
		mdStr, err := conv.ConvertString(string(htmlStr))
		if err != nil {
			return "", nil, err
		}
		e.logger.Info("convert File", "file name", clearFileName(zipfile.Name))
		res[clearFileName(zipfile.Name)] = bytes.NewBufferString(mdStr)
	}

	// page sequence
	result := bytes.NewBuffer(nil)
	for _, href := range p.Guide.References {
		if r, ok := res[clearFileName(href.Href)]; ok {
			if _, err := io.Copy(result, r); err != nil {
				return "", nil, err
			}
			result.WriteString("\n\n")
		}
	}

	result.WriteString("# 目录\n\n")
	for _, v := range toc {
		fmt.Fprintf(result, "- [%s](#%s)\n", v["title"], v["playOrder"])
	}

	// NCX content src -> playOrder, used as the anchor id of each spine item.
	temp := make(map[string]string)
	for _, v := range toc {
		temp[v["src"]] = v["playOrder"]
	}
	for _, itemRef := range p.Spine.ItemRefs {
		title := temp[e.resourcesIdMap[itemRef.IDRef].Href]
		e.logger.Debug("add File", "file name", clearFileName(e.resourcesIdMap[itemRef.IDRef].Href))
		if r, ok := res[clearFileName(e.resourcesIdMap[itemRef.IDRef].Href)]; ok {
			result.WriteString("<span id=" + title + "></span>\n\n")
			if _, err := io.Copy(result, r); err != nil {
				return "", nil, err
			}
			result.WriteString("\n\n")
		}
	}

	str, err := e.exchangeUrl(ctx, result.String())
	return p.Metadata.Title, str, err
}
// clearFileName reduces a path or href to its base file name and drops
// anything from the first '#' on (the URL fragment).
func clearFileName(str string) string {
	base := filepath.Base(str)
	name, _, _ := strings.Cut(base, "#")
	return name
}
// uploadFile uploads every non-skippable archive entry via processFile, with
// at most 10 uploads in flight at a time.
//
// It returns the first error a worker reports, or nil once all workers have
// finished without error. It returns early if acquiring the semaphore fails
// (for example because ctx was cancelled). The error channel is buffered for
// every entry, so workers never block on reporting.
func (e *EpubConverter) uploadFile(ctx context.Context, kbID string, zipReader *zip.Reader) error {
	const maxParallel = 10 // concurrency limit

	var pending sync.WaitGroup
	failures := make(chan error, len(zipReader.File))
	limiter := semaphore.NewWeighted(maxParallel)

	for _, entry := range zipReader.File {
		if isSkippableFile(entry.Name) {
			continue
		}
		if err := limiter.Acquire(ctx, 1); err != nil {
			return err // could not get a slot, e.g. ctx cancelled
		}
		pending.Add(1)
		go func(entry *zip.File) {
			// Deferred calls run in reverse order: release the slot first,
			// then mark the worker as done.
			defer pending.Done()
			defer limiter.Release(1)
			if err := e.processFile(ctx, entry, kbID); err != nil {
				failures <- err
			}
		}(entry)
	}

	// Close the channel once every worker is done so the receive below
	// yields nil when nothing failed.
	go func() {
		pending.Wait()
		close(failures)
	}()

	return <-failures // first reported error, or nil
}
// processFile uploads one archive entry to the object store under
// "<kbID>/<uuid><ext>" and records the mapping from the entry name to
// "/<bucket>/<object key>" in e.resources. The mapping is recorded before the
// upload is attempted.
//
// NOTE(review): the content type is looked up via e.relativePath[f.Name],
// which is keyed by manifest href — confirm that hrefs and zip entry names
// actually coincide for the archives this handles.
func (e *EpubConverter) processFile(ctx context.Context, f *zip.File, kbID string) error {
	src, err := f.Open()
	if err != nil {
		return fmt.Errorf("打开文件 %s 失败: %v", f.Name, err)
	}
	defer src.Close()

	extension := strings.ToLower(filepath.Ext(f.Name))
	objectKey := fmt.Sprintf("%s/%s%s", kbID, uuid.New().String(), extension)
	publicPath := fmt.Sprintf("/%s/%s", domain.Bucket, objectKey)

	e.mu.Lock()
	e.resources[f.Name] = publicPath
	e.mu.Unlock()

	opts := minio.PutObjectOptions{
		ContentType:  e.resourcesIdMap[e.relativePath[f.Name]].MediaType,
		UserMetadata: map[string]string{"originalname": filepath.Base(f.Name)},
	}
	_, err = e.minioClient.PutObject(ctx, domain.Bucket, objectKey, src, f.FileInfo().Size(), opts)
	return err
}
// isSkippableFile reports whether an archive entry is not uploaded as a
// resource: the EPUB bookkeeping files ("META-INF/container.xml", "mimetype")
// and any entry whose extension is exactly ".html", ".css" or ".xml"
// (the comparison is case-sensitive).
func isSkippableFile(name string) bool {
	if name == "META-INF/container.xml" || name == "mimetype" {
		return true
	}
	switch filepath.Ext(name) {
	case ".html", ".css", ".xml":
		return true
	}
	return false
}
// exchangeUrl rewrites the image URLs in the Markdown content using the
// e.resources mapping: a URL with an entry is replaced by the uploaded
// object's path, any other URL is kept unchanged. The rewriting itself is
// done by ExchangeMarkDownImageUrl.
func (e *EpubConverter) exchangeUrl(ctx context.Context, content string) ([]byte, error) {
	// lookup resolves one image URL against the resource mapping.
	lookup := func(_ context.Context, originUrl *string) (string, error) {
		if originUrl == nil {
			return "", fmt.Errorf("originUrl is nil")
		}
		if mapped, ok := e.resources[*originUrl]; ok {
			return mapped, nil
		}
		// No mapping: keep the original URL.
		return *originUrl, nil
	}

	rewritten, err := ExchangeMarkDownImageUrl(ctx, []byte(content), lookup)
	if err != nil {
		return nil, fmt.Errorf("failed to exchange URLs: %w", err)
	}
	return []byte(rewritten), nil
}
// getFullPath returns the full-path attribute of the first <rootfile> in
// META-INF/container.xml, e.g.
// <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>.
//
// It returns an error when container.xml is missing, cannot be decoded, lists
// no rootfile, or the first rootfile has an empty full-path.
func getFullPath(zipReader *zip.Reader) (string, error) {
	// XML structs mirroring the layout of container.xml.
	type Rootfile struct {
		FullPath  string `xml:"full-path,attr"`
		MediaType string `xml:"media-type,attr"`
	}
	type Rootfiles struct {
		Rootfile []Rootfile `xml:"rootfile"`
	}
	type Container struct {
		XMLName   xml.Name  `xml:"container"`
		Xmlns     string    `xml:"xmlns,attr"`
		Version   string    `xml:"version,attr"`
		Rootfiles Rootfiles `xml:"rootfiles"`
	}

	for _, f := range zipReader.File {
		if f.Name != "META-INF/container.xml" {
			continue
		}
		// parse container.xml
		r, err := f.Open()
		if err != nil {
			return "", err
		}
		defer r.Close()

		var c Container
		if err := xml.NewDecoder(r).Decode(&c); err != nil {
			return "", fmt.Errorf("failed to decode container.xml: %w", err)
		}
		// Check the length first: indexing an empty rootfile list would panic.
		if len(c.Rootfiles.Rootfile) == 0 || c.Rootfiles.Rootfile[0].FullPath == "" {
			return "", errors.New("full-path not found in container.xml")
		}
		return c.Rootfiles.Rootfile[0].FullPath, nil
	}
	return "", errors.New("container.xml not found")
}
// valid checks the archive's "mimetype" entry: its content must be exactly
// "application/epub+zip". An archive without a "mimetype" entry passes.
func valid(zipReader *zip.Reader) error {
	for _, entry := range zipReader.File {
		if entry.Name != "mimetype" {
			continue
		}
		rc, err := entry.Open()
		if err != nil {
			return err
		}
		defer rc.Close()

		var content bytes.Buffer
		if _, err := content.ReadFrom(rc); err != nil {
			return fmt.Errorf("failed to read mimetype: %w", err)
		}
		if content.String() != "application/epub+zip" {
			return errors.New("invalid mimetype")
		}
	}
	return nil
}
// Package represents the root element of the OPF file.
type Package struct {
	XMLName  xml.Name `xml:"package"`
	Spine    Spine    `xml:"spine"` // reading order
	Guide    Guide    `xml:"guide"` // guide references (e.g. the cover)
	Manifest struct { // resource manifest
		Items []Item `xml:"item"` // resources
	} `xml:"manifest"`
	Metadata struct { // metadata
		// Title is the content of <dc:title>. encoding/xml matches on the
		// element's local name, so the tag must be "title": a tag spelled
		// "dc:title" never matches and leaves the field empty.
		Title string `xml:"title"`
	} `xml:"metadata"`
}

// Spine represents the spine section of the OPF file.
type Spine struct {
	Toc      string    `xml:"toc,attr"`
	ItemRefs []ItemRef `xml:"itemref"`
}

// ItemRef represents an itemref in the spine section.
type ItemRef struct {
	IDRef string `xml:"idref,attr"`
}

// Guide represents the guide section of the OPF file.
type Guide struct {
	References []Reference `xml:"reference"`
}

// Reference represents a reference in the guide section.
type Reference struct {
	Href  string `xml:"href,attr"`
	Title string `xml:"title,attr"`
	Type  string `xml:"type,attr"`
}

// Item represents an item in the manifest section.
type Item struct {
	ID        string `xml:"id,attr"`
	Href      string `xml:"href,attr"`
	MediaType string `xml:"media-type,attr"`
}
// getOpf locates the OPF package file named by META-INF/container.xml and
// decodes it. It returns an error when the path cannot be determined, the
// entry is not in the archive, or decoding fails.
func getOpf(zipReader *zip.Reader) (*Package, error) {
	// read ./META_INF/container.xml
	opfPath, err := getFullPath(zipReader)
	if err != nil {
		return nil, err
	}

	// read ./OEBPS/content.opf
	for _, entry := range zipReader.File {
		if entry.Name != opfPath {
			continue
		}
		rc, err := entry.Open()
		if err != nil {
			return nil, err
		}
		defer rc.Close()

		pkg := new(Package)
		if err := xml.NewDecoder(rc).Decode(pkg); err != nil {
			return nil, fmt.Errorf("解码OPF文件失败: %v", err)
		}
		return pkg, nil
	}
	return nil, errors.New("content.opf not found")
}
// NCX mirrors the root element of an NCX table-of-contents file.
type NCX struct {
	XMLName xml.Name `xml:"ncx"`
	NavMap  NavMap   `xml:"navMap"`
}

// NavMap holds the navPoint elements directly under <navMap>.
type NavMap struct {
	NavPoints []NavPoint `xml:"navPoint"`
}

// NavPoint is one table-of-contents entry.
type NavPoint struct {
	ID        string   `xml:"id,attr"`
	PlayOrder string   `xml:"playOrder,attr"`
	NavLabel  NavLabel `xml:"navLabel"`
	Content   Content  `xml:"content"`
}

// NavLabel carries the display text of a NavPoint.
type NavLabel struct {
	Text string `xml:"text"`
}

// Content carries the target reference of a NavPoint.
type Content struct {
	Src string `xml:"src,attr"`
}

// ParseNCX decodes an NCX document from r and returns one map per navPoint
// found directly under navMap, with the keys "id", "playOrder", "title" and
// "src". The result is nil when there are no such navPoints.
func ParseNCX(r io.Reader) ([]map[string]string, error) {
	doc := NCX{}
	dec := xml.NewDecoder(r)
	if err := dec.Decode(&doc); err != nil {
		return nil, fmt.Errorf("解析NCX失败: %v", err)
	}

	var toc []map[string]string
	for i := range doc.NavMap.NavPoints {
		point := &doc.NavMap.NavPoints[i]
		toc = append(toc, map[string]string{
			"id":        point.ID,
			"playOrder": point.PlayOrder,
			"title":     point.NavLabel.Text,
			"src":       point.Content.Src,
		})
	}
	return toc, nil
}

239
backend/utils/feed.go Normal file
View File

@@ -0,0 +1,239 @@
package utils
import (
"encoding/json"
"encoding/xml"
"fmt"
"strings"
)
// FeedItem represents a single item in any feed format.
// Fields:
// Title: item title
// Link: item link URL
// Description: item description
// Published: publication time as a string, copied verbatim from the feed
// (the format is whatever the feed source uses)
type FeedItem struct {
Title string // item title
Link string // item link URL
Description string // item description
Published string // publication time (string form)
}
// Feed represents a generic feed structure shared by the RSS, Atom and
// JSON Feed parsers in this file.
type Feed struct {
// Title is the feed title.
Title string
// Description is the feed description (the subtitle for Atom feeds).
Description string
// Link is the feed's link URL.
Link string
// Items holds the feed entries in document order.
Items []FeedItem
}
// cleanXMLContent drops every rune that is not a legal XML 1.0 character
// (see https://www.w3.org/TR/xml/#charsets) and returns the rest unchanged.
func cleanXMLContent(content string) string {
	// legal reports whether r is in the XML 1.0 Char production.
	legal := func(r rune) bool {
		switch {
		case r == 0x9, r == 0xA, r == 0xD:
			return true
		case r >= 0x20 && r <= 0xD7FF:
			return true
		case r >= 0xE000 && r <= 0xFFFD:
			return true
		case r >= 0x10000 && r <= 0x10FFFF:
			return true
		}
		return false
	}
	return strings.Map(func(r rune) rune {
		if legal(r) {
			return r
		}
		return -1 // a negative result makes strings.Map drop the rune
	}, content)
}
// ParseFeed downloads the feed at url and parses it into the generic Feed
// structure.
//
// The format is detected by substring, in this order: "<rss" selects RSS,
// "<feed" selects Atom, and "\"version\":" selects JSON Feed.
//
// Parameters:
//
//	url: URL of the feed to fetch
//
// Returns:
//
//	*Feed: the parsed feed (title, description, link and items)
//	error: a download error, a parse error, or "unsupported feed format"
func ParseFeed(url string) (*Feed, error) {
	raw, err := HTTPGet(url)
	if err != nil {
		return nil, fmt.Errorf("failed to get feed content: %v", err)
	}

	// Decode, then strip characters that are illegal in XML.
	cleaned := cleanXMLContent(DecodeBytes(raw))
	payload := []byte(cleaned)

	switch {
	case strings.Contains(cleaned, "<rss"):
		return parseRSS(payload)
	case strings.Contains(cleaned, "<feed"):
		return parseAtom(payload)
	case strings.Contains(cleaned, "\"version\":"):
		return parseJSONFeed(payload)
	}
	return nil, fmt.Errorf("unsupported feed format")
}
// parseRSS parses RSS content (such as RSS 2.0) into the generic Feed
// structure.
//
// Parameter: content - the RSS document bytes.
// Returns: the parsed feed, or an error when the XML cannot be decoded.
//
// The item link is the first non-empty value of, in order: the href attribute
// of the first <link>, the text of the first <link>, the Atom extension link,
// and the guid when it is a permalink (isPermaLink absent or "true").
// An empty <link> element no longer blocks the later fallbacks.
//
// NOTE(review): the un-namespaced "link" tag presumably also captures
// <atom:link> children, which would make the first rule cover them — confirm
// against encoding/xml's matching rules.
func parseRSS(content []byte) (*Feed, error) {
	type RSSFeed struct {
		XMLName xml.Name `xml:"rss"`
		Channel struct {
			Title       string `xml:"title"`
			Description string `xml:"description"`
			Link        string `xml:"link"`
			AtomLink    struct {
				Href string `xml:"href,attr"`
			} `xml:"http://www.w3.org/2005/Atom link"`
			Items []struct {
				Title string `xml:"title"`
				Links []struct {
					Href  string `xml:"href,attr"`
					Value string `xml:",chardata"`
				} `xml:"link"`
				Description string `xml:"description"`
				PubDate     string `xml:"pubDate"`
				Guid        struct {
					IsPermaLink string `xml:"isPermaLink,attr"`
					Value       string `xml:",chardata"`
				} `xml:"guid"`
				AtomLink struct {
					Href string `xml:"href,attr"`
				} `xml:"http://www.w3.org/2005/Atom link"`
			} `xml:"item"`
		} `xml:"channel"`
	}

	var rssFeed RSSFeed
	if err := xml.Unmarshal(content, &rssFeed); err != nil {
		return nil, fmt.Errorf("failed to parse RSS: %v", err)
	}

	feed := &Feed{
		Title:       rssFeed.Channel.Title,
		Description: rssFeed.Channel.Description,
		Link:        rssFeed.Channel.Link,
		Items:       make([]FeedItem, 0),
	}

	for _, item := range rssFeed.Channel.Items {
		feedItem := FeedItem{
			Title:       item.Title,
			Description: item.Description,
			Published:   item.PubDate,
		}

		// Pick the link from the available sources in order of preference.
		hasLink := len(item.Links) > 0
		guidIsPermalink := item.Guid.IsPermaLink == "" || item.Guid.IsPermaLink == "true"
		switch {
		case hasLink && item.Links[0].Href != "":
			feedItem.Link = item.Links[0].Href
		case hasLink && item.Links[0].Value != "":
			feedItem.Link = item.Links[0].Value
		case item.AtomLink.Href != "":
			feedItem.Link = item.AtomLink.Href
		case item.Guid.Value != "" && guidIsPermalink:
			feedItem.Link = item.Guid.Value
		}

		feed.Items = append(feed.Items, feedItem)
	}
	return feed, nil
}
// parseAtom parses Atom 1.0 content into the generic Feed structure.
//
// Parameter: content - the Atom document bytes.
// Returns: the parsed feed, or an error when the XML cannot be decoded.
//
// The feed link and each entry link are the href of the first <link> element
// in document order (the original note recommends preferring the
// rel="alternate" link). The subtitle becomes the feed description; an
// entry's summary and updated time become its description and published time.
func parseAtom(content []byte) (*Feed, error) {
	type AtomFeed struct {
		XMLName  xml.Name `xml:"feed"`
		Title    string   `xml:"title"`
		Subtitle string   `xml:"subtitle"`
		Link     []struct {
			Href string `xml:"href,attr"`
		} `xml:"link"`
		Entries []struct {
			Title string `xml:"title"`
			Link  []struct {
				Href string `xml:"href,attr"`
			} `xml:"link"`
			Summary string `xml:"summary"`
			Updated string `xml:"updated"`
		} `xml:"entry"`
	}

	var parsed AtomFeed
	if err := xml.Unmarshal(content, &parsed); err != nil {
		return nil, fmt.Errorf("failed to parse Atom: %v", err)
	}

	out := &Feed{
		Title:       parsed.Title,
		Description: parsed.Subtitle,
		Items:       make([]FeedItem, 0),
	}
	if len(parsed.Link) != 0 {
		out.Link = parsed.Link[0].Href
	}

	for _, e := range parsed.Entries {
		link := ""
		if len(e.Link) != 0 {
			link = e.Link[0].Href
		}
		out.Items = append(out.Items, FeedItem{
			Title:       e.Title,
			Link:        link,
			Description: e.Summary,
			Published:   e.Updated,
		})
	}
	return out, nil
}
// parseJSONFeed parses JSON Feed content (such as version 1.1) into the
// generic Feed structure.
//
// Parameter: content - the JSON Feed document bytes.
// Returns: the parsed feed, or an error when the JSON cannot be decoded.
//
// Field mapping: home_page_url -> Feed.Link; url -> FeedItem.Link;
// content_text -> FeedItem.Description; date_published -> FeedItem.Published.
func parseJSONFeed(content []byte) (*Feed, error) {
	type JSONFeed struct {
		Version     string `json:"version"`
		Title       string `json:"title"`
		Description string `json:"description"`
		HomePageURL string `json:"home_page_url"`
		Items       []struct {
			Title         string `json:"title"`
			URL           string `json:"url"`
			ContentText   string `json:"content_text"`
			DatePublished string `json:"date_published"`
		} `json:"items"`
	}

	var parsed JSONFeed
	if err := json.Unmarshal(content, &parsed); err != nil {
		return nil, fmt.Errorf("failed to parse JSON Feed: %v", err)
	}

	out := &Feed{
		Title:       parsed.Title,
		Description: parsed.Description,
		Link:        parsed.HomePageURL,
		Items:       make([]FeedItem, 0),
	}
	for i := range parsed.Items {
		src := &parsed.Items[i]
		out.Items = append(out.Items, FeedItem{
			Title:       src.Title,
			Link:        src.URL,
			Description: src.ContentText,
			Published:   src.DatePublished,
		})
	}
	return out, nil
}

16
backend/utils/file.go Normal file
View File

@@ -0,0 +1,16 @@
package utils
import (
"path/filepath"
"slices"
"strings"
)
// IsImageFile reports whether filename has one of the supported image
// extensions (.jpg, .jpeg, .png, .webp), compared case-insensitively.
func IsImageFile(filename string) bool {
	return slices.Contains(
		[]string{".jpg", ".jpeg", ".png", ".webp"},
		strings.ToLower(filepath.Ext(filename)),
	)
}

188
backend/utils/ip_addr.go Normal file
View File

@@ -0,0 +1,188 @@
package utils
import (
"fmt"
"net"
"net/http"
"net/netip"
"net/url"
"strings"
"github.com/labstack/echo/v4"
)
// documentationPrefixes lists the address blocks reserved for documentation
// and examples; isDocumentationIP checks addresses against it.
var documentationPrefixes = []netip.Prefix{
netip.MustParsePrefix("192.0.2.0/24"), // TEST-NET-1
netip.MustParsePrefix("198.51.100.0/24"), // TEST-NET-2
netip.MustParsePrefix("203.0.113.0/24"), // TEST-NET-3
netip.MustParsePrefix("2001:db8::/32"), // IPv6 Documentation
}
// GetClientIPFromRemoteAddr returns the host part of the RemoteAddr of the
// request carried by c. It looks only at RemoteAddr, not at any header.
func GetClientIPFromRemoteAddr(c echo.Context) string {
	req := c.Request()
	return ExtractHostFromRemoteAddr(req)
}
// ExtractHostFromRemoteAddr returns the host part of r.RemoteAddr. When the
// address cannot be split into host and port it returns the address itself
// with surrounding whitespace trimmed; an empty RemoteAddr yields "".
func ExtractHostFromRemoteAddr(r *http.Request) string {
	remote := r.RemoteAddr
	if remote == "" {
		return ""
	}
	if host, _, err := net.SplitHostPort(remote); err == nil {
		return host
	}
	return strings.TrimSpace(remote)
}
// IsPrivateOrReservedIP reports whether ipStr is a private or reserved IP
// address. A string that does not parse as an IP yields false.
//
// The checks, in order:
//   - private ranges: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, fc00::/7
//   - loopback: 127.0.0.0/8, ::1/128
//   - link-local unicast and multicast: 169.254.0.0/16, fe80::/10, ...
//   - documentation ranges: 192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24,
//     2001:db8::/32
//   - the other reserved ranges handled by isOtherReservedIP
func IsPrivateOrReservedIP(ipStr string) bool {
	ip := net.ParseIP(ipStr)
	switch {
	case ip == nil:
		return false // invalid IP address
	case ip.IsPrivate(),
		ip.IsLoopback(),
		ip.IsLinkLocalUnicast(),
		ip.IsLinkLocalMulticast(),
		isDocumentationIP(ip):
		return true
	default:
		return isOtherReservedIP(ip)
	}
}
// isDocumentationIP reports whether ip lies in one of documentationPrefixes.
// IPv4-mapped IPv6 addresses are unmapped first so they compare against the
// IPv4 prefixes; a slice that is not a valid address yields false.
func isDocumentationIP(ip net.IP) bool {
	parsed, ok := netip.AddrFromSlice(ip)
	if !ok {
		return false
	}
	parsed = parsed.Unmap()
	for i := range documentationPrefixes {
		if documentationPrefixes[i].Contains(parsed) {
			return true
		}
	}
	return false
}
// isOtherReservedIP reports whether ip falls in one of the remaining reserved
// ranges that the net.IP helper methods do not cover.
//
// IPv4 (including IPv4-mapped IPv6, which To4 unwraps):
//
//	0.0.0.0/8       - Current network (RFC 1122)
//	100.64.0.0/10   - Shared Address Space (RFC 6598)
//	192.0.0.0/24    - IETF Protocol Assignments (RFC 6890)
//	192.88.99.0/24  - IPv6 to IPv4 relay (RFC 3068)
//	198.18.0.0/15   - Network benchmark tests (RFC 2544)
//	240.0.0.0/4     - Reserved (RFC 1112)
//
// IPv6:
//
//	::/128          - Unspecified address
//	64:ff9b::/96    - IPv4-IPv6 translation (RFC 6052)
//	100::/64        - Discard prefix (RFC 6666)
//	2001::/23       - IETF Protocol Assignments (contains 2001:2::/48)
//	2002::/16       - 6to4 (RFC 3056)
//	ff00::/8        - Multicast
//
// 64:ff9b::/96 and 100::/64 are matched as whole prefixes; comparing only
// against the prefix's first address would let every other address in the
// block through (e.g. 64:ff9b::7f00:1, which embeds 127.0.0.1).
func isOtherReservedIP(ip net.IP) bool {
	if ip4 := ip.To4(); ip4 != nil {
		return ip4[0] == 0 ||
			(ip4[0] == 100 && (ip4[1]&0xc0) == 64) ||
			(ip4[0] == 192 && ip4[1] == 0 && ip4[2] == 0) ||
			(ip4[0] == 192 && ip4[1] == 88 && ip4[2] == 99) ||
			(ip4[0] == 198 && (ip4[1]&0xfe) == 18) ||
			(ip4[0]&0xf0) == 240
	}
	if len(ip) != net.IPv6len {
		return false // neither IPv4 nor IPv6
	}

	allZero := func(b []byte) bool {
		for _, v := range b {
			if v != 0 {
				return false
			}
		}
		return true
	}

	unspecified := ip.Equal(net.IPv6unspecified)
	nat64 := ip[0] == 0x00 && ip[1] == 0x64 && ip[2] == 0xff && ip[3] == 0x9b && allZero(ip[4:12])
	discard := ip[0] == 0x01 && ip[1] == 0x00 && allZero(ip[2:8])
	ietf := ip[0] == 0x20 && ip[1] == 0x01 && (ip[2]&0xfe) == 0
	sixToFour := ip[0] == 0x20 && ip[1] == 0x02
	multicast := ip[0] == 0xff

	return unspecified || nat64 || discard || ietf || sixToFour || multicast
}
// IsIPv6 reports whether ipStr parses as an IP address that has no IPv4
// form. IPv4-mapped IPv6 addresses therefore count as IPv4, and unparsable
// input yields false.
func IsIPv6(ipStr string) bool {
	parsed := net.ParseIP(ipStr)
	if parsed == nil {
		return false
	}
	return parsed.To4() == nil
}
// ValidateURLForSSRF validates a URL to prevent SSRF attacks. It rejects:
//   - URLs that do not parse
//   - schemes other than http and https
//   - URLs carrying credentials (userinfo)
//   - URLs without a hostname
//   - hostnames that fail to resolve, or for which any resolved address is
//     private or reserved (see IsPrivateOrReservedIP)
//
// NOTE(review): the hostname is resolved here, separately from whatever
// request the caller makes afterwards, so a later resolution may return a
// different address — confirm callers account for that.
func ValidateURLForSSRF(urlStr string) error {
	target, err := url.Parse(urlStr)
	if err != nil {
		return fmt.Errorf("invalid URL format: %w", err)
	}

	// Only http and https are allowed.
	switch target.Scheme {
	case "http", "https":
	default:
		return fmt.Errorf("invalid URL scheme: only http and https are allowed")
	}

	// Block URLs with userinfo (credentials).
	if target.User != nil {
		return fmt.Errorf("URLs with credentials are not allowed")
	}

	host := target.Hostname()
	if host == "" {
		return fmt.Errorf("invalid URL: missing hostname")
	}

	// Resolve the hostname and reject it if any address is private/reserved.
	addrs, err := net.LookupIP(host)
	if err != nil {
		return fmt.Errorf("failed to resolve hostname: %w", err)
	}
	for _, addr := range addrs {
		if IsPrivateOrReservedIP(addr.String()) {
			return fmt.Errorf("access to private/reserved IP addresses is not allowed")
		}
	}
	return nil
}

View File

@@ -0,0 +1,75 @@
package utils
import (
"bytes"
"errors"
"io"
"sync"
)
// Node is one node of a ProcessorTree: a data buffer plus ordered children.
type Node struct {
// buf accumulates the data written to this node through ProcessorTree.Add.
buf *bytes.Buffer
// son holds the child nodes in the order GetNode created them.
son []*Node
}
// newNode returns a node with an empty buffer and an empty child list.
func newNode() *Node {
	return &Node{
		buf: bytes.NewBufferString(""),
		son: []*Node{},
	}
}
// ProcessorTree collects byte output in a tree of nodes and later joins it
// into one buffer by walking the tree from the root, each node before its
// children.
type ProcessorTree struct {
// mu is locked by GetNode and Add while they modify the tree.
mu *sync.Mutex
// root is the tree's entry node.
root *Node
// result receives the joined output when GetResult walks the tree.
result *bytes.Buffer
}
// NewProcessorTree returns a tree with a fresh root node, its own mutex and
// an empty result buffer.
func NewProcessorTree() *ProcessorTree {
	tree := new(ProcessorTree)
	tree.mu = &sync.Mutex{}
	tree.root = newNode()
	tree.result = bytes.NewBufferString("")
	return tree
}
// GetNode creates a new node, appends it as the last child of farther and
// returns it. It returns an error when farther is nil.
func (t *ProcessorTree) GetNode(farther *Node) (*Node, error) {
	if farther == nil {
		return nil, errors.New("father is nil")
	}
	child := newNode()

	t.mu.Lock()
	farther.son = append(farther.son, child)
	t.mu.Unlock()

	return child, nil
}
// Add appends data to node's buffer while holding the tree lock.
// It returns an error when node is nil.
func (t *ProcessorTree) Add(node *Node, data []byte) error {
	if node == nil {
		return errors.New("node is nil")
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	_, _ = node.buf.Write(data) // the write result is not checked, as before
	return nil
}
// GetResult walks the tree from the root, appends every node's buffered data
// to the result buffer (each node before its children, children in creation
// order) and returns the accumulated bytes.
//
// The walk holds the tree lock, like GetNode and Add, so it does not race
// with concurrent modifications. Copying consumes the node buffers, so data
// already collected is not appended again by a later call. The returned slice
// is the result buffer's own storage.
func (t *ProcessorTree) GetResult() ([]byte, error) {
	t.mu.Lock()
	defer t.mu.Unlock()

	if err := t.getRes(t.root); err != nil {
		return nil, err
	}
	return t.result.Bytes(), nil
}
// getRes copies node's buffer into t.result and then recurses into its
// children in order. A nil node is a no-op. It does no locking itself.
func (t *ProcessorTree) getRes(node *Node) error {
	if node == nil {
		return nil
	}
	_, err := io.Copy(t.result, node.buf)
	if err != nil {
		return err
	}
	for i := range node.son {
		if err := t.getRes(node.son[i]); err != nil {
			return err
		}
	}
	return nil
}

7
backend/utils/time.go Normal file
View File

@@ -0,0 +1,7 @@
package utils
import "time"
// GetTimeHourOffset returns the current time truncated to the hour (via
// time.Truncate) and shifted by the given number of hours, which may be
// negative.
func GetTimeHourOffset(hours int64) time.Time {
	base := time.Now().Truncate(time.Hour)
	shift := time.Duration(hours) * time.Hour
	return base.Add(shift)
}

366
backend/utils/utils.go Normal file
View File

@@ -0,0 +1,366 @@
package utils
import (
"bytes"
"context"
"crypto/tls"
"fmt"
"io"
"mime"
"net/http"
"net/url"
"os"
"path"
"path/filepath"
"strings"
"sync"
"time"
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
"github.com/google/uuid"
"github.com/minio/minio-go/v7"
tiktoken_loader "github.com/pkoukk/tiktoken-go-loader"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/renderer/html"
"github.com/yuin/goldmark/text"
"github.com/chaitin/panda-wiki/domain"
"github.com/chaitin/panda-wiki/store/s3"
)
// HTTPGet sends an HTTP GET request to url and returns the response body.
//
// The request times out after 10 seconds. Any status other than 200 OK is
// reported as an error, and transport errors are wrapped so callers can
// inspect them with errors.Is / errors.As.
//
// NOTE(review): TLS certificate verification is disabled
// (InsecureSkipVerify), so HTTPS responses are not authenticated. This is
// kept as found because callers may depend on reaching hosts with
// self-signed certificates — confirm whether that is still required.
func HTTPGet(url string) ([]byte, error) {
	client := &http.Client{
		Timeout: 10 * time.Second,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{
				InsecureSkipVerify: true,
			},
			// The transport is created per call and never reused, so a
			// kept-alive connection would only linger in a pool nobody
			// reads from again. Close the connection after the response.
			DisableKeepAlives: true,
		},
	}

	resp, err := client.Get(url)
	if err != nil {
		return nil, fmt.Errorf("failed to get %s: %w", url, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}
	return io.ReadAll(resp.Body)
}
// DecodeBytes converts data to a string, trying the encodings utf-8, gbk,
// gb2312 and big5 in that order and returning the first result decode
// accepts. If none is accepted, the raw bytes are returned as a string.
func DecodeBytes(data []byte) string {
	for _, name := range [...]string{"utf-8", "gbk", "gb2312", "big5"} {
		decoded, err := decode(data, name)
		if err != nil {
			continue
		}
		return decoded
	}
	return string(data)
}
// IsURLValid reports whether urlStr parses as a URL that has both a scheme
// and a host.
func IsURLValid(urlStr string) bool {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	if parsed.Scheme == "" {
		return false
	}
	return parsed.Host != ""
}
// URLNormalize returns urlStr in a normalised form: the fragment is removed,
// the path is cleaned with path.Clean (resolving "." and "..", collapsing
// repeated slashes and dropping a trailing slash), and the default port is
// removed (80 for http, 443 for https). Input that does not parse is
// returned unchanged.
func URLNormalize(urlStr string) string {
	u, err := url.Parse(urlStr)
	if err != nil {
		return urlStr
	}

	// remove url fragment
	u.Fragment = ""

	// normalize path; an empty path stays empty, because path.Clean("")
	// is "." and would turn "http://host" into "http://host/."
	if u.Path != "" {
		u.Path = path.Clean(u.Path)
	}

	// remove default port
	switch {
	case u.Scheme == "http" && u.Port() == "80",
		u.Scheme == "https" && u.Port() == "443":
		u.Host = u.Hostname()
	}
	return u.String()
}
// URLRemovePath returns rawURL with its path, query string and fragment
// removed; the scheme, userinfo and host are kept. It returns an error when
// rawURL does not parse.
func URLRemovePath(rawURL string) (string, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}
	u.Path, u.RawPath = "", ""
	u.RawQuery, u.Fragment = "", ""
	return u.String(), nil
}
// decode is a placeholder for charset conversion: it ignores encoding and
// returns data converted to a string unchanged, and it never returns an
// error.
func decode(data []byte, encoding string) (string, error) {
// need to implement encoding conversion based on actual needs
// use golang.org/x/text/encoding package
return string(data), nil
}
// GetHeaderMap parses header, a newline-separated list of "key=value" lines,
// into a map. Each line is split at its first '='; lines without '=' are
// skipped. Keys and values are stored as-is, without trimming. The returned
// map is never nil.
func GetHeaderMap(header string) map[string]string {
	result := map[string]string{}
	lines := strings.Split(header, "\n")
	for i := range lines {
		key, value, found := strings.Cut(lines[i], "=")
		if !found {
			continue
		}
		result[key] = value
	}
	return result
}
// UrlEncode percent-encodes the non-ASCII runes of s with url.QueryEscape and
// copies every ASCII rune (including '/', spaces and other reserved
// characters) through unchanged.
func UrlEncode(s string) string {
	var out strings.Builder
	for _, r := range s {
		if r >= 128 {
			out.WriteString(url.QueryEscape(string(r)))
			continue
		}
		out.WriteRune(r)
	}
	return out.String()
}
// RemoveFirstDir drops the first slash-separated component of path and joins
// the remaining components with filepath.Join. A path with a single
// component is returned unchanged.
func RemoveFirstDir(path string) string {
	segments := strings.Split(filepath.ToSlash(path), "/")
	// Only one component: there is no leading directory to remove.
	if len(segments) <= 1 {
		return path
	}
	return filepath.Join(segments[1:]...)
}
// RemoveURLParams returns rawURL without its query string; every other part,
// including the fragment, is kept. It returns an error when rawURL does not
// parse.
func RemoveURLParams(rawURL string) (string, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}
	u.RawQuery = "" // drop the query string
	return u.String(), nil
}
// UploadImage stores an image in the object store under "<kbID>/<uuid><ext>"
// and returns its path as "/<bucket>/<object key>".
//
// imageURL is fetched over HTTP(S) when it starts with "http://" or
// "https://" (the request is bound to ctx); otherwise it is read as a local
// file path. The content type is the response's Content-Type header when
// present, else the type registered for the file extension, else
// "application/octet-stream". The decoded original file name is stored as
// the "originalname" user metadata.
//
// NOTE(review): a non-HTTP imageURL is read straight from the local
// filesystem, and HTTP URLs are fetched without any address filtering —
// callers must not pass untrusted input here.
func UploadImage(ctx context.Context, minioClient *s3.MinioClient, imageURL string, kbID string) (string, error) {
	if minioClient == nil {
		return "", fmt.Errorf("minio client is nil")
	}

	var (
		data        []byte
		contentType string
	)
	if strings.HasPrefix(imageURL, "http://") || strings.HasPrefix(imageURL, "https://") {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, imageURL, nil)
		if err != nil {
			return "", fmt.Errorf("failed to fetch image: %v", err)
		}
		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			return "", fmt.Errorf("failed to fetch image: %v", err)
		}
		defer resp.Body.Close()

		// Check the status code.
		if resp.StatusCode != http.StatusOK {
			return "", fmt.Errorf("HTTP request failed with status: %s", resp.Status)
		}

		// Read the image data.
		data, err = io.ReadAll(resp.Body)
		if err != nil {
			return "", fmt.Errorf("failed to read image data: %v", err)
		}
		contentType = resp.Header.Get("Content-Type")
	} else {
		// Read the image from the local filesystem.
		var err error
		data, err = os.ReadFile(imageURL)
		if err != nil {
			return "", fmt.Errorf("failed to read image file: %v", err)
		}
	}

	// Take the image name from the URL path.
	parsedURL, err := url.Parse(imageURL)
	if err != nil {
		return "", fmt.Errorf("failed to parse URL: %v", err)
	}
	_, filename := filepath.Split(parsedURL.Path)

	// Undo URL encoding in the name (e.g. non-ASCII file names).
	decodedName, err := url.PathUnescape(filename)
	if err != nil {
		decodedName = filename // fall back to the raw name
	}

	ext := strings.ToLower(filepath.Ext(decodedName))
	// Fall back to the extension only when the response gave no type; the
	// old `ext == ""` test threw the header value away for names without an
	// extension and never consulted the extension at all.
	if contentType == "" && ext != "" {
		contentType = mime.TypeByExtension(ext)
	}
	if contentType == "" {
		contentType = "application/octet-stream"
	}

	imgName := fmt.Sprintf("%s/%s%s", kbID, uuid.New().String(), ext)
	if _, err := minioClient.PutObject(
		ctx,
		domain.Bucket,
		imgName,
		bytes.NewReader(data),
		int64(len(data)),
		minio.PutObjectOptions{
			ContentType: contentType,
			UserMetadata: map[string]string{
				"originalname": decodedName,
			},
		},
	); err != nil {
		return "", fmt.Errorf("failed to upload image to MinIO: %v", err)
	}
	return fmt.Sprintf("/%s/%s", domain.Bucket, imgName), nil
}
// GetTitleFromMarkdown derives a title from markdown: the text with
// surrounding whitespace trimmed, cut to its first 60 runes when longer.
// It does not look for a heading.
func GetTitleFromMarkdown(markdown string) string {
	const maxRunes = 60
	trimmed := strings.TrimSpace(markdown)
	if runes := []rune(trimmed); len(runes) > maxRunes {
		return string(runes[:maxRunes])
	}
	return trimmed
}
// ExchangeMarkDownImageUrl replaces the destination of every image in the
// Markdown document mdContent with the URL that getUrl returns for it.
//
// getUrl is called concurrently, one goroutine per image. The first error
// received from any call is returned. The modified document is rendered and
// then converted back to Markdown, which is the returned string.
func ExchangeMarkDownImageUrl(
	ctx context.Context,
	mdContent []byte,
	getUrl func(ctx context.Context, originUrl *string) (string, error),
) (string, error) {
	engine := goldmark.New(
		goldmark.WithRendererOptions(
			html.WithHardWraps(),
		),
	)
	root := engine.Parser().Parse(text.NewReader(mdContent))

	// 1. Collect the image nodes together with their original URLs.
	type imageRef struct {
		node   *ast.Image
		rawUrl string
	}
	var refs []imageRef
	collect := func(n ast.Node, entering bool) (ast.WalkStatus, error) {
		if entering {
			if img, ok := n.(*ast.Image); ok {
				refs = append(refs, imageRef{node: img, rawUrl: string(img.Destination)})
			}
		}
		return ast.WalkContinue, nil
	}
	if err := ast.Walk(root, collect); err != nil {
		return "", err
	}

	// 2. Resolve the new URLs concurrently. The channel has room for every
	// result, so workers never block even if we stop reading early.
	type outcome struct {
		idx    int
		newUrl string
		err    error
	}
	outcomes := make(chan outcome, len(refs))
	var pending sync.WaitGroup
	pending.Add(len(refs))
	for i := range refs {
		go func(idx int, rawUrl string) {
			defer pending.Done()
			newUrl, err := getUrl(ctx, &rawUrl)
			outcomes <- outcome{idx: idx, newUrl: newUrl, err: err}
		}(i, refs[i].rawUrl)
	}
	// Close the channel once every worker has finished.
	go func() {
		pending.Wait()
		close(outcomes)
	}()

	// 3. Apply the results as they arrive.
	for out := range outcomes {
		if out.err != nil {
			return "", out.err
		}
		refs[out.idx].node.Destination = []byte(out.newUrl)
	}

	// 4. Render the modified document.
	var rendered bytes.Buffer
	if err := engine.Renderer().Render(&rendered, mdContent, root); err != nil {
		return "", err
	}

	// 5. Convert the rendered output back to Markdown and return it.
	conv := converter.NewConverter(
		converter.WithPlugins(
			base.NewBasePlugin(),
			commonmark.NewCommonmarkPlugin(
				commonmark.WithStrongDelimiter("__"),
			),
		),
	)
	markdown, err := conv.ConvertReader(&rendered)
	if err != nil {
		return "", err
	}
	return string(markdown), nil
}
// Localloader loads tiktoken BPE data through the offline loader.
type Localloader struct{}

// LoadTiktokenBpe ignores its argument and always loads
// "cl100k_base.tiktoken" via the offline loader.
func (m *Localloader) LoadTiktokenBpe(_ string) (map[string]int, error) {
	return tiktoken_loader.NewOfflineLoader().LoadTiktokenBpe("cl100k_base.tiktoken")
}
// GetFileNameWithoutExt returns the last element of path with its final
// extension removed ("c.tar.gz" becomes "c.tar").
func GetFileNameWithoutExt(path string) string {
	base := filepath.Base(path)
	return strings.TrimSuffix(base, filepath.Ext(base))
}
// IsUUID reports whether s is accepted by uuid.Parse.
func IsUUID(s string) bool {
	if _, err := uuid.Parse(s); err != nil {
		return false
	}
	return true
}
// IsLikelyHTML is a cheap heuristic: after trimming surrounding whitespace,
// the text must start with '<' and end with '>'.
func IsLikelyHTML(text string) bool {
	body := strings.TrimSpace(text)
	if !strings.HasPrefix(body, "<") {
		return false
	}
	return strings.HasSuffix(body, ">")
}