package utils
import (
"archive/zip"
"bytes"
"context"
"encoding/xml"
"errors"
"fmt"
"io"
"mime/multipart"
"path/filepath"
"strings"
"sync"
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
"github.com/chaitin/panda-wiki/domain"
"github.com/chaitin/panda-wiki/log"
"github.com/chaitin/panda-wiki/store/s3"
"github.com/google/uuid"
"github.com/minio/minio-go/v7"
"golang.org/x/sync/semaphore"
)
// EpubConverter converts an uploaded EPUB archive into a single Markdown
// document and stores the archive's resource files in object storage.
type EpubConverter struct {
	logger      *log.Logger
	minioClient *s3.MinioClient

	// mu serializes the writes to resources made by concurrent uploads.
	mu sync.Mutex
	// resources maps an archive entry name to the object-storage path the
	// entry was uploaded to.
	resources map[string]string

	// resourcesIdMap maps a manifest item id to its manifest entry.
	resourcesIdMap map[string]Item
	// relativePath maps a manifest item href to its manifest item id.
	relativePath map[string]string
}
// NewEpubConverter builds an EpubConverter that logs under the
// "epubConverter" module and uploads through the given MinIO client.
// All lookup tables start out empty.
func NewEpubConverter(logger *log.Logger, minio *s3.MinioClient) *EpubConverter {
	c := new(EpubConverter)
	c.logger = logger.WithModule("epubConverter")
	c.minioClient = minio
	c.resources = map[string]string{}
	c.resourcesIdMap = map[string]Item{}
	c.relativePath = map[string]string{}
	return c
}
// Convert turns an uploaded EPUB archive into one Markdown document.
//
// It validates the archive, reads the OPF package document, uploads the
// resource files to object storage, converts the documents referenced by the
// guide and the spine to Markdown and concatenates them: guide documents
// first, then a table of contents built from the NCX file, then the spine
// documents in spine order. Image URLs in the result are finally rewritten
// through exchangeUrl.
//
// It returns the title from the package metadata and the Markdown bytes. On
// error the title is empty and the bytes are nil.
func (e *EpubConverter) Convert(ctx context.Context, kbID string, data *multipart.FileHeader) (string, []byte, error) {
	reader, err := data.Open()
	if err != nil {
		return "", nil, err
	}
	defer reader.Close()

	zipReader, err := zip.NewReader(reader, data.Size)
	if err != nil {
		return "", nil, err
	}
	if err := valid(zipReader); err != nil {
		return "", nil, err
	}

	// The OPF package document carries the manifest, the spine and the guide.
	p, err := getOpf(zipReader)
	if err != nil {
		return "", nil, err
	}
	for _, item := range p.Manifest.Items {
		e.resourcesIdMap[item.ID] = item
		e.relativePath[item.Href] = item.ID
	}

	// Upload the resource files so their URLs can be rewritten later.
	if err := e.uploadFile(ctx, kbID, zipReader); err != nil {
		return "", nil, err
	}

	// Only documents named by the guide or the spine ever reach the output,
	// so only those are read and converted; everything else (images, fonts,
	// ...) is skipped instead of being pushed through the HTML converter.
	wanted := make(map[string]struct{}, len(p.Guide.References)+len(p.Spine.ItemRefs))
	for _, ref := range p.Guide.References {
		wanted[clearFileName(ref.Href)] = struct{}{}
	}
	for _, itemRef := range p.Spine.ItemRefs {
		wanted[clearFileName(e.resourcesIdMap[itemRef.IDRef].Href)] = struct{}{}
	}

	conv := converter.NewConverter(
		converter.WithPlugins(
			base.NewBasePlugin(),
			commonmark.NewCommonmarkPlugin(
				commonmark.WithStrongDelimiter("__"),
			),
		),
	)
	conv.Register.TagType("a", converter.TagTypeRemove, converter.PriorityStandard)

	res := make(map[string]*bytes.Buffer, len(wanted))
	var toc []map[string]string
	for _, zipfile := range zipReader.File {
		name := clearFileName(zipfile.Name)
		isNCX := strings.ToLower(filepath.Ext(zipfile.Name)) == ".ncx"
		_, isDoc := wanted[name]
		if !isNCX && !isDoc {
			continue
		}

		// Each entry is opened once and closed before the next iteration.
		raw, err := readEpubEntry(zipfile)
		if err != nil {
			return "", nil, err
		}
		if isNCX {
			if toc, err = ParseNCX(bytes.NewReader(raw)); err != nil {
				return "", nil, err
			}
		}
		if !isDoc {
			continue
		}

		mdStr, err := conv.ConvertString(string(raw))
		if err != nil {
			return "", nil, err
		}
		e.logger.Info("convert File", "file name", name)
		res[name] = bytes.NewBufferString(mdStr)
	}

	// Assemble the output. io.Copy drains the source buffer, so a document
	// that is referenced more than once only contributes text the first time.
	result := bytes.NewBuffer(nil)
	for _, ref := range p.Guide.References {
		if r, ok := res[clearFileName(ref.Href)]; ok {
			if _, err := io.Copy(result, r); err != nil {
				return "", nil, err
			}
			result.WriteString("\n\n")
		}
	}

	// NOTE(review): the entries below link to "#<playOrder>" anchors, but
	// nothing in this function emits matching anchors. The previous code
	// looked up a per-document anchor and never used it (which does not
	// compile); confirm whether an anchor should precede each spine document.
	result.WriteString("# 目录\n\n")
	for _, v := range toc {
		fmt.Fprintf(result, "- [%s](#%s)\n", v["title"], v["playOrder"])
	}

	for _, itemRef := range p.Spine.ItemRefs {
		name := clearFileName(e.resourcesIdMap[itemRef.IDRef].Href)
		e.logger.Debug("add File", "file name", name)
		if r, ok := res[name]; ok {
			result.WriteString("\n\n")
			if _, err := io.Copy(result, r); err != nil {
				return "", nil, err
			}
			result.WriteString("\n\n")
		}
	}

	str, err := e.exchangeUrl(ctx, result.String())
	if err != nil {
		return "", nil, err
	}
	return p.Metadata.Title, str, nil
}

// readEpubEntry returns the full uncompressed content of one archive entry
// and closes the entry before returning.
func readEpubEntry(f *zip.File) ([]byte, error) {
	rc, err := f.Open()
	if err != nil {
		return nil, err
	}
	defer rc.Close()
	return io.ReadAll(rc)
}
// clearFileName reduces an href or archive entry name to its bare file name:
// the "#fragment" suffix is dropped first and the directory part second, so
// "OEBPS/text/ch1.xhtml#sec" becomes "ch1.xhtml".
func clearFileName(str string) string {
	// Cut the fragment before taking the base name: a fragment may itself
	// contain "/" and would otherwise be mistaken for a path element.
	str, _, _ = strings.Cut(str, "#")
	return filepath.Base(str)
}
// uploadFile uploads every non-skippable archive entry to object storage,
// running at most 10 uploads at a time.
//
// It waits for all started uploads to finish before returning, so no
// goroutine is still reading from zipReader once the caller moves on. The
// first error cancels the remaining uploads and is the one returned; nil is
// returned when every upload succeeded.
func (e *EpubConverter) uploadFile(ctx context.Context, kbID string, zipReader *zip.Reader) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	var (
		wg       sync.WaitGroup
		once     sync.Once
		firstErr error
	)
	// fail records the first error only and stops the other uploads.
	fail := func(err error) {
		once.Do(func() {
			firstErr = err
			cancel()
		})
	}

	sem := semaphore.NewWeighted(10) // at most 10 concurrent uploads
	for _, f := range zipReader.File {
		if isSkippableFile(f.Name) {
			continue
		}
		// Acquire fails once ctx is cancelled, either by the caller or by an
		// earlier failed upload; stop scheduling new work in that case.
		if err := sem.Acquire(ctx, 1); err != nil {
			fail(err)
			break
		}
		wg.Add(1)
		go func(f *zip.File) {
			defer wg.Done()
			defer sem.Release(1)
			if err := e.processFile(ctx, f, kbID); err != nil {
				fail(err)
			}
		}(f)
	}

	wg.Wait()
	return firstErr
}
// processFile uploads one archive entry to object storage under
// "<kbID>/<uuid><ext>" and, once the upload has succeeded, records the
// resulting "/<bucket>/<object>" path in e.resources keyed by the entry name.
//
// It returns the error from opening the entry (wrapped) or from the upload
// (unchanged).
func (e *EpubConverter) processFile(ctx context.Context, f *zip.File, kbID string) error {
	file, err := f.Open()
	if err != nil {
		return fmt.Errorf("打开文件 %s 失败: %w", f.Name, err)
	}
	defer file.Close()

	ext := strings.ToLower(filepath.Ext(f.Name))
	ossPath := fmt.Sprintf("%s/%s%s", kbID, uuid.New().String(), ext)

	// NOTE(review): relativePath is filled from manifest hrefs while f.Name is
	// the full archive entry name. If hrefs are relative to the OPF directory
	// this lookup misses and the content type stays empty — confirm.
	contentType := e.resourcesIdMap[e.relativePath[f.Name]].MediaType

	if _, err := e.minioClient.PutObject(
		ctx,
		domain.Bucket,
		ossPath,
		file,
		f.FileInfo().Size(),
		minio.PutObjectOptions{
			ContentType:  contentType,
			UserMetadata: map[string]string{"originalname": filepath.Base(f.Name)},
		},
	); err != nil {
		return err
	}

	// Register the mapping only after the object exists, so a failed upload
	// never leaves a path pointing at nothing.
	e.mu.Lock()
	e.resources[f.Name] = fmt.Sprintf("/%s/%s", domain.Bucket, ossPath)
	e.mu.Unlock()
	return nil
}
// isSkippableFile reports whether an archive entry must not be uploaded as a
// resource: the two fixed EPUB bookkeeping files (matched exactly) and any
// entry whose extension is .html, .css or .xml. The extension is compared
// case-insensitively, matching how extensions are handled elsewhere in this
// file.
func isSkippableFile(name string) bool {
	if name == "META-INF/container.xml" || name == "mimetype" {
		return true
	}
	switch strings.ToLower(filepath.Ext(name)) {
	case ".html", ".css", ".xml":
		return true
	}
	return false
}
// exchangeUrl rewrites the image URLs of a Markdown document by handing it to
// ExchangeMarkDownImageUrl together with a lookup into e.resources. A URL
// found in e.resources is replaced by its mapped value; any other URL is
// kept as it is.
func (e *EpubConverter) exchangeUrl(ctx context.Context, content string) ([]byte, error) {
	// lookup resolves one URL against the uploaded-resource table.
	lookup := func(_ context.Context, originUrl *string) (string, error) {
		if originUrl == nil {
			return "", fmt.Errorf("originUrl is nil")
		}
		mapped, found := e.resources[*originUrl]
		if !found {
			// No mapping: leave the URL untouched.
			return *originUrl, nil
		}
		return mapped, nil
	}

	rewritten, err := ExchangeMarkDownImageUrl(ctx, []byte(content), lookup)
	if err != nil {
		return nil, fmt.Errorf("failed to exchange URLs: %w", err)
	}
	return []byte(rewritten), nil
}
// getFullPath returns the full-path attribute of the first <rootfile> listed
// in META-INF/container.xml, i.e. the archive path of the OPF package
// document.
//
// It returns an error when container.xml is missing, cannot be decoded, lists
// no rootfile, or the first rootfile has an empty full-path.
func getFullPath(zipReader *zip.Reader) (string, error) {
	// XML shapes mirroring the content of container.xml.
	type Rootfile struct {
		FullPath  string `xml:"full-path,attr"`
		MediaType string `xml:"media-type,attr"`
	}
	type Rootfiles struct {
		Rootfile []Rootfile `xml:"rootfile"`
	}
	type Container struct {
		XMLName   xml.Name  `xml:"container"`
		Xmlns     string    `xml:"xmlns,attr"`
		Version   string    `xml:"version,attr"`
		Rootfiles Rootfiles `xml:"rootfiles"`
	}

	for _, f := range zipReader.File {
		if f.Name != "META-INF/container.xml" {
			continue
		}
		r, err := f.Open()
		if err != nil {
			return "", err
		}
		defer r.Close()

		var c Container
		if err := xml.NewDecoder(r).Decode(&c); err != nil {
			return "", fmt.Errorf("failed to decode container.xml: %w", err)
		}
		// Guard the index: a container without any <rootfile> must produce an
		// error, not a panic.
		if len(c.Rootfiles.Rootfile) == 0 || c.Rootfiles.Rootfile[0].FullPath == "" {
			return "", errors.New("full-path not found in container.xml")
		}
		return c.Rootfiles.Rootfile[0].FullPath, nil
	}
	return "", errors.New("container.xml not found")
}
// valid checks the archive's "mimetype" entry. It returns an error when the
// entry cannot be read or when its content, ignoring surrounding whitespace,
// is not "application/epub+zip". An archive without a "mimetype" entry is
// accepted.
func valid(zipReader *zip.Reader) error {
	for _, f := range zipReader.File {
		if f.Name != "mimetype" {
			continue
		}
		r, err := f.Open()
		if err != nil {
			return err
		}
		defer r.Close()

		var buf bytes.Buffer
		if _, err := buf.ReadFrom(r); err != nil {
			return fmt.Errorf("failed to read mimetype: %w", err)
		}
		// Tolerate a trailing newline or other padding around the media type
		// instead of rejecting the whole archive for it.
		if strings.TrimSpace(buf.String()) != "application/epub+zip" {
			return errors.New("invalid mimetype")
		}
		return nil
	}
	return nil
}
// Package represents the root element of the OPF file.
type Package struct {
	XMLName  xml.Name `xml:"package"`
	Spine    Spine    `xml:"spine"` // content documents, in order
	Guide    Guide    `xml:"guide"` // guide references (e.g. the cover)
	Manifest struct { // resource manifest
		Items []Item `xml:"item"` // one entry per resource
	} `xml:"manifest"`
	Metadata struct { // metadata
		// Title is the text of the <dc:title> element. encoding/xml matches
		// struct tags against the local element name, so the tag must not
		// carry the "dc:" prefix — with the prefix it never matches.
		Title string `xml:"title"`
	} `xml:"metadata"`
}

// Spine represents the spine section of the OPF file.
type Spine struct {
	Toc      string    `xml:"toc,attr"`
	ItemRefs []ItemRef `xml:"itemref"`
}

// ItemRef represents an itemref in the spine section.
type ItemRef struct {
	IDRef string `xml:"idref,attr"`
}

// Guide represents the guide section of the OPF file.
type Guide struct {
	References []Reference `xml:"reference"`
}

// Reference represents a reference in the guide section.
type Reference struct {
	Href  string `xml:"href,attr"`
	Title string `xml:"title,attr"`
	Type  string `xml:"type,attr"`
}

// Item represents an item in the manifest section.
type Item struct {
	ID        string `xml:"id,attr"`
	Href      string `xml:"href,attr"`
	MediaType string `xml:"media-type,attr"`
}
// getOpf locates the OPF package document through META-INF/container.xml and
// decodes it.
//
// It returns an error when the container lookup fails, when no archive entry
// has the advertised path, or when the document cannot be opened or decoded.
// The decode error is wrapped, so callers can inspect it with errors.Is/As.
func getOpf(zipReader *zip.Reader) (*Package, error) {
	// container.xml names the archive path of the package document.
	opfPath, err := getFullPath(zipReader)
	if err != nil {
		return nil, err
	}

	for _, f := range zipReader.File {
		if f.Name != opfPath {
			continue
		}
		r, err := f.Open()
		if err != nil {
			return nil, err
		}
		defer r.Close()

		var p Package
		if err := xml.NewDecoder(r).Decode(&p); err != nil {
			return nil, fmt.Errorf("解码OPF文件失败: %w", err)
		}
		return &p, nil
	}
	return nil, errors.New("content.opf not found")
}
// XML model of an NCX table-of-contents document.
type (
	// NCX is the root <ncx> element.
	NCX struct {
		XMLName xml.Name `xml:"ncx"`
		NavMap  NavMap   `xml:"navMap"`
	}

	// NavMap is the <navMap> element holding the navigation points.
	NavMap struct {
		NavPoints []NavPoint `xml:"navPoint"`
	}

	// NavPoint is one <navPoint>: its id and playOrder attributes plus its
	// label and content target.
	NavPoint struct {
		ID        string   `xml:"id,attr"`
		PlayOrder string   `xml:"playOrder,attr"`
		NavLabel  NavLabel `xml:"navLabel"`
		Content   Content  `xml:"content"`
	}

	// NavLabel is the <navLabel> element; Text is its <text> child.
	NavLabel struct {
		Text string `xml:"text"`
	}

	// Content is the <content> element; Src is its src attribute.
	Content struct {
		Src string `xml:"src,attr"`
	}
)
// ParseNCX decodes an NCX document from r and returns its table of contents.
//
// Each top-level navPoint of the navMap yields one map with the keys "id",
// "playOrder", "title" (the navLabel text) and "src" (the content src), in
// document order. The result is nil when the navMap has no navPoint. A decode
// failure is returned wrapped, so callers can inspect it with errors.Is/As.
func ParseNCX(r io.Reader) ([]map[string]string, error) {
	var ncx NCX
	if err := xml.NewDecoder(r).Decode(&ncx); err != nil {
		return nil, fmt.Errorf("解析NCX失败: %w", err)
	}

	var toc []map[string]string
	for _, np := range ncx.NavMap.NavPoints {
		toc = append(toc, map[string]string{
			"id":        np.ID,
			"playOrder": np.PlayOrder,
			"title":     np.NavLabel.Text,
			"src":       np.Content.Src,
		})
	}
	return toc, nil
}