Files
YouduWiki/backend/utils/epub.go
2026-05-21 19:52:45 +08:00

431 lines
10 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package utils
import (
"archive/zip"
"bytes"
"context"
"encoding/xml"
"errors"
"fmt"
"io"
"mime/multipart"
"path/filepath"
"strings"
"sync"
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
"github.com/chaitin/panda-wiki/domain"
"github.com/chaitin/panda-wiki/log"
"github.com/chaitin/panda-wiki/store/s3"
"github.com/google/uuid"
"github.com/minio/minio-go/v7"
"golang.org/x/sync/semaphore"
)
// EpubConverter converts an EPUB archive into Markdown and uploads the
// archive's non-document entries to object storage through minioClient.
//
// The lookup tables below are filled in while Convert runs.
type EpubConverter struct {
	logger *log.Logger
	// mu is held while worker goroutines write to resources.
	mu          sync.Mutex
	minioClient *s3.MinioClient
	// resources maps a zip entry name to the "/<bucket>/<object>" path the
	// entry was uploaded to.
	resources map[string]string
	// resourcesIdMap maps a manifest item id to its manifest Item.
	resourcesIdMap map[string]Item
	// relativePath maps a manifest item href to the item's id.
	relativePath map[string]string
}
// NewEpubConverter returns a converter whose lookup tables are empty and whose
// logger is scoped to the "epubConverter" module.
func NewEpubConverter(logger *log.Logger, minio *s3.MinioClient) *EpubConverter {
	c := new(EpubConverter)
	c.logger = logger.WithModule("epubConverter")
	c.minioClient = minio
	c.resources = map[string]string{}
	c.resourcesIdMap = map[string]Item{}
	c.relativePath = map[string]string{}
	return c
}
// Convert turns an uploaded EPUB archive into a single Markdown document.
//
// Steps, in order:
//  1. open the upload as a zip archive and check its mimetype entry;
//  2. decode the OPF package document and index its manifest;
//  3. upload the non-skippable entries to object storage (uploadFile);
//  4. convert every entry to Markdown, and parse the .ncx entry as the table
//     of contents;
//  5. emit the guide references, then a "目录" (contents) list, then the spine
//     items in reading order, and finally rewrite image URLs to the uploaded
//     object paths.
//
// Parameters:
//   - ctx:  passed to the upload and URL-rewrite steps.
//   - kbID: knowledge-base id, used as the object key prefix for uploads.
//   - data: the uploaded EPUB file.
//
// It returns the title read from the OPF metadata, the Markdown bytes, and the
// first error encountered.
func (e *EpubConverter) Convert(ctx context.Context, kbID string, data *multipart.FileHeader) (string, []byte, error) {
	reader, err := data.Open()
	if err != nil {
		return "", nil, err
	}
	defer reader.Close()

	zipReader, err := zip.NewReader(reader, data.Size)
	if err != nil {
		return "", nil, err
	}
	if err := valid(zipReader); err != nil {
		return "", nil, err
	}

	// Read the OPF package document referenced by META-INF/container.xml.
	p, err := getOpf(zipReader)
	if err != nil {
		return "", nil, err
	}
	for _, item := range p.Manifest.Items {
		e.resourcesIdMap[item.ID] = item
		e.relativePath[item.Href] = item.ID
	}

	// Upload resource files so their URLs can be substituted at the end.
	if err := e.uploadFile(ctx, kbID, zipReader); err != nil {
		return "", nil, err
	}

	conv := converter.NewConverter(
		converter.WithPlugins(
			base.NewBasePlugin(),
			commonmark.NewCommonmarkPlugin(
				commonmark.WithStrongDelimiter("__"),
			),
		),
	)
	conv.Register.TagType("a", converter.TagTypeRemove, converter.PriorityStandard)

	// readEntry reads one zip entry fully and closes it before returning, so
	// entries are not all held open until Convert itself returns.
	readEntry := func(f *zip.File) ([]byte, error) {
		rc, err := f.Open()
		if err != nil {
			return nil, err
		}
		defer rc.Close()
		return io.ReadAll(rc)
	}

	res := make(map[string]*bytes.Buffer, len(zipReader.File))
	var toc []map[string]string
	for _, zipfile := range zipReader.File {
		raw, err := readEntry(zipfile)
		if err != nil {
			return "", nil, err
		}
		if strings.ToLower(filepath.Ext(zipfile.Name)) == ".ncx" {
			toc, err = ParseNCX(bytes.NewReader(raw))
			if err != nil {
				return "", nil, err
			}
		}
		mdStr, err := conv.ConvertString(string(raw))
		if err != nil {
			return "", nil, err
		}
		name := clearFileName(zipfile.Name)
		e.logger.Info("convert File", "file name", name)
		res[name] = bytes.NewBufferString(mdStr)
	}

	// Page sequence: guide references first. io.Copy drains the source
	// buffer, so a file emitted here contributes nothing if the spine
	// references it again below.
	result := bytes.NewBuffer(nil)
	for _, href := range p.Guide.References {
		if r, ok := res[clearFileName(href.Href)]; ok {
			if _, err := io.Copy(result, r); err != nil {
				return "", nil, err
			}
			result.WriteString("\n\n")
		}
	}

	result.WriteString("# 目录\n\n")
	for _, v := range toc {
		fmt.Fprintf(result, "- [%s](#%s)\n", v["title"], v["playOrder"])
	}

	// Map each TOC target to its playOrder, which is used as the anchor id.
	anchors := make(map[string]string, len(toc))
	for _, v := range toc {
		anchors[v["src"]] = v["playOrder"]
	}

	for _, itemRef := range p.Spine.ItemRefs {
		href := e.resourcesIdMap[itemRef.IDRef].Href
		title := anchors[href]
		e.logger.Debug("add File", "file name", clearFileName(href))
		if r, ok := res[clearFileName(href)]; ok {
			result.WriteString("<span id=" + title + "></span>\n\n")
			if _, err := io.Copy(result, r); err != nil {
				return "", nil, err
			}
			result.WriteString("\n\n")
		}
	}

	str, err := e.exchangeUrl(ctx, result.String())
	return p.Metadata.Title, str, err
}
// clearFileName reduces a path or href to its final path element and drops
// everything from the first '#' onward (the URL fragment).
func clearFileName(str string) string {
	base, _, _ := strings.Cut(filepath.Base(str), "#")
	return base
}
// uploadFile uploads every non-skippable entry of zipReader via processFile,
// running at most 10 uploads concurrently.
//
// It always waits for every started worker before returning, so no goroutine
// is still reading from the archive once the caller closes the underlying
// file. After the first failure the derived context is cancelled, which stops
// new uploads from being scheduled.
//
// It returns the first error that was recorded, or nil if all uploads
// succeeded.
func (e *EpubConverter) uploadFile(ctx context.Context, kbID string, zipReader *zip.Reader) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	var wg sync.WaitGroup
	// At most one error is sent per entry, so sends never block.
	errCh := make(chan error, len(zipReader.File))
	sem := semaphore.NewWeighted(10) // limit concurrency to 10

	for _, f := range zipReader.File {
		if isSkippableFile(f.Name) {
			continue
		}
		if err := sem.Acquire(ctx, 1); err != nil {
			// Context cancelled (by the caller or by a failed worker):
			// record it and stop scheduling further uploads.
			errCh <- err
			break
		}
		wg.Add(1)
		go func(f *zip.File) {
			defer wg.Done()
			defer sem.Release(1)
			if err := e.processFile(ctx, f, kbID); err != nil {
				errCh <- err
				cancel()
			}
		}(f)
	}

	wg.Wait()
	close(errCh)
	// Receiving from the closed channel yields the first buffered error, or
	// nil when nothing was sent.
	return <-errCh
}
// processFile uploads a single zip entry to the bucket under a freshly
// generated "<kbID>/<uuid><ext>" key and records the resulting
// "/<bucket>/<key>" path in e.resources, keyed by the entry name.
//
// The content type is taken from the manifest item whose href equals the
// entry name, if there is one; otherwise it is left empty.
func (e *EpubConverter) processFile(ctx context.Context, f *zip.File, kbID string) error {
	src, err := f.Open()
	if err != nil {
		return fmt.Errorf("打开文件 %s 失败: %v", f.Name, err)
	}
	defer src.Close()

	suffix := strings.ToLower(filepath.Ext(f.Name))
	objectKey := fmt.Sprintf("%s/%s%s", kbID, uuid.New().String(), suffix)
	publicPath := fmt.Sprintf("/%s/%s", domain.Bucket, objectKey)

	// The mapping is recorded before the upload is attempted.
	e.mu.Lock()
	e.resources[f.Name] = publicPath
	e.mu.Unlock()

	opts := minio.PutObjectOptions{
		ContentType:  e.resourcesIdMap[e.relativePath[f.Name]].MediaType,
		UserMetadata: map[string]string{"originalname": filepath.Base(f.Name)},
	}
	if _, err := e.minioClient.PutObject(ctx, domain.Bucket, objectKey, src, f.FileInfo().Size(), opts); err != nil {
		return err
	}
	return nil
}
// isSkippableFile reports whether the zip entry called name should not be
// uploaded as a resource: the EPUB bookkeeping entries
// ("META-INF/container.xml", "mimetype") and any entry whose extension is
// .html, .css or .xml.
//
// The extension is compared case-insensitively, matching how extensions are
// normalised elsewhere in this file.
func isSkippableFile(name string) bool {
	if name == "META-INF/container.xml" || name == "mimetype" {
		return true
	}
	switch strings.ToLower(filepath.Ext(name)) {
	case ".html", ".css", ".xml": // extend here for other extensions
		return true
	}
	return false
}
// exchangeUrl rewrites the image URLs in the Markdown content through
// ExchangeMarkDownImageUrl. A URL that is a key of e.resources is replaced by
// its mapped object path; any other URL is kept unchanged.
func (e *EpubConverter) exchangeUrl(ctx context.Context, content string) ([]byte, error) {
	// resolve is the per-URL callback handed to ExchangeMarkDownImageUrl.
	resolve := func(_ context.Context, originUrl *string) (string, error) {
		if originUrl == nil {
			return "", fmt.Errorf("originUrl is nil")
		}
		mapped, found := e.resources[*originUrl]
		if !found {
			// No mapping recorded: fall back to the original URL.
			return *originUrl, nil
		}
		return mapped, nil
	}

	rewritten, err := ExchangeMarkDownImageUrl(ctx, []byte(content), resolve)
	if err != nil {
		return nil, fmt.Errorf("failed to exchange URLs: %w", err)
	}
	return []byte(rewritten), nil
}
// getFullPath returns the full-path attribute of the first <rootfile> element
// in META-INF/container.xml, e.g.
//
//	<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
//
// It returns an error when container.xml is missing, cannot be decoded,
// declares no <rootfile> at all, or its first <rootfile> has an empty
// full-path.
func getFullPath(zipReader *zip.Reader) (string, error) {
	// XML shapes matching the contents of container.xml.
	type Rootfile struct {
		FullPath  string `xml:"full-path,attr"`
		MediaType string `xml:"media-type,attr"`
	}
	type Rootfiles struct {
		Rootfile []Rootfile `xml:"rootfile"`
	}
	type Container struct {
		XMLName   xml.Name  `xml:"container"`
		Xmlns     string    `xml:"xmlns,attr"`
		Version   string    `xml:"version,attr"`
		Rootfiles Rootfiles `xml:"rootfiles"`
	}

	for _, f := range zipReader.File {
		if f.Name != "META-INF/container.xml" {
			continue
		}
		r, err := f.Open()
		if err != nil {
			return "", err
		}
		defer r.Close()

		var c Container
		if err := xml.NewDecoder(r).Decode(&c); err != nil {
			return "", fmt.Errorf("failed to decode container.xml: %w", err)
		}
		// Guard the index: a container without any <rootfile> must yield an
		// error, not an out-of-range panic.
		if len(c.Rootfiles.Rootfile) == 0 || c.Rootfiles.Rootfile[0].FullPath == "" {
			return "", errors.New("full-path not found in container.xml")
		}
		return c.Rootfiles.Rootfile[0].FullPath, nil
	}
	return "", errors.New("container.xml not found")
}
// valid checks the archive's "mimetype" entry: its content must be exactly
// "application/epub+zip". An archive without a "mimetype" entry passes.
func valid(zipReader *zip.Reader) error {
	const wantMimetype = "application/epub+zip"

	for _, entry := range zipReader.File {
		if entry.Name != "mimetype" {
			continue
		}
		rc, err := entry.Open()
		if err != nil {
			return err
		}
		defer rc.Close()

		content := new(bytes.Buffer)
		if _, err := content.ReadFrom(rc); err != nil {
			return fmt.Errorf("failed to read mimetype: %w", err)
		}
		if content.String() != wantMimetype {
			return errors.New("invalid mimetype")
		}
	}
	return nil
}
// Package represents the root <package> element of the OPF file.
type Package struct {
	XMLName  xml.Name `xml:"package"`
	Spine    Spine    `xml:"spine"` // reading order of the content
	Guide    Guide    `xml:"guide"` // guide references such as the cover
	Manifest struct { // resource manifest
		Items []Item `xml:"item"` // one entry per resource
	} `xml:"manifest"`
	Metadata struct { // publication metadata
		// Title is the content of <dc:title>. encoding/xml matches a field
		// against the element's local name, so the tag must be "title"; a
		// "dc:title" tag never matches and leaves the field empty.
		Title string `xml:"title"`
	} `xml:"metadata"`
}

// Spine represents the <spine> section of the OPF file.
type Spine struct {
	Toc      string    `xml:"toc,attr"`
	ItemRefs []ItemRef `xml:"itemref"`
}

// ItemRef represents an <itemref> in the spine section; IDRef is the id of a
// manifest Item.
type ItemRef struct {
	IDRef string `xml:"idref,attr"`
}

// Guide represents the <guide> section of the OPF file.
type Guide struct {
	References []Reference `xml:"reference"`
}

// Reference represents a <reference> in the guide section.
type Reference struct {
	Href  string `xml:"href,attr"`
	Title string `xml:"title,attr"`
	Type  string `xml:"type,attr"`
}

// Item represents an <item> in the manifest section.
type Item struct {
	ID        string `xml:"id,attr"`
	Href      string `xml:"href,attr"`
	MediaType string `xml:"media-type,attr"`
}
// getOpf decodes the OPF package document of the archive. The entry to decode
// is the one whose name equals the path returned by getFullPath (read from
// META-INF/container.xml).
func getOpf(zipReader *zip.Reader) (*Package, error) {
	opfPath, err := getFullPath(zipReader)
	if err != nil {
		return nil, err
	}

	for _, entry := range zipReader.File {
		if entry.Name != opfPath {
			continue
		}
		rc, err := entry.Open()
		if err != nil {
			return nil, err
		}
		defer rc.Close()

		pkg := new(Package)
		if err := xml.NewDecoder(rc).Decode(pkg); err != nil {
			return nil, fmt.Errorf("解码OPF文件失败: %v", err)
		}
		return pkg, nil
	}
	return nil, errors.New("content.opf not found")
}
// Types describing the parts of an NCX document that ParseNCX reads.
type (
	// NCX is the root <ncx> element.
	NCX struct {
		XMLName xml.Name `xml:"ncx"`
		NavMap  NavMap   `xml:"navMap"`
	}

	// NavMap holds the <navPoint> elements that are direct children of
	// <navMap>.
	NavMap struct {
		NavPoints []NavPoint `xml:"navPoint"`
	}

	// NavPoint is one table-of-contents entry.
	NavPoint struct {
		ID        string   `xml:"id,attr"`
		PlayOrder string   `xml:"playOrder,attr"`
		NavLabel  NavLabel `xml:"navLabel"`
		Content   Content  `xml:"content"`
	}

	// NavLabel carries the entry's display text.
	NavLabel struct {
		Text string `xml:"text"`
	}

	// Content carries the entry's target href.
	Content struct {
		Src string `xml:"src,attr"`
	}
)

// ParseNCX decodes an NCX document from r and returns one map per navPoint
// that is a direct child of navMap, with the keys "id", "playOrder", "title"
// and "src". navPoints nested inside other navPoints are not included.
func ParseNCX(r io.Reader) ([]map[string]string, error) {
	doc := new(NCX)
	if err := xml.NewDecoder(r).Decode(doc); err != nil {
		return nil, fmt.Errorf("解析NCX失败: %v", err)
	}

	var entries []map[string]string
	points := doc.NavMap.NavPoints
	for i := range points {
		np := &points[i]
		row := make(map[string]string, 4)
		row["id"] = np.ID
		row["playOrder"] = np.PlayOrder
		row["title"] = np.NavLabel.Text
		row["src"] = np.Content.Src
		entries = append(entries, row)
	}
	return entries, nil
}