Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Migrate ePub generation to go-epub #679

Merged
merged 21 commits into from
Sep 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ require (
github.com/gin-contrib/requestid v0.0.6
github.com/gin-contrib/static v0.0.1
github.com/gin-gonic/gin v1.9.1
github.com/go-shiori/go-epub v1.2.0
github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad
github.com/go-shiori/warc v0.0.0-20200621032813-359908319d1d
github.com/go-sql-driver/mysql v1.7.1
Expand Down Expand Up @@ -57,6 +58,7 @@ require (
github.com/go-playground/validator/v10 v10.15.3 // indirect
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/gofrs/uuid/v5 v5.0.0 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
github.com/google/uuid v1.3.1 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
Expand All @@ -81,6 +83,7 @@ require (
github.com/tdewolff/parse v2.3.4+incompatible // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.11 // indirect
github.com/vincent-petithory/dataurl v1.0.0 // indirect
go.etcd.io/bbolt v1.3.7 // indirect
go.uber.org/atomic v1.11.0 // indirect
golang.org/x/arch v0.5.0 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ github.com/go-playground/validator/v10 v10.15.3/go.mod h1:9iXMNT7sEkjXb0I+enO7QX
github.com/go-shiori/dom v0.0.0-20190930082056-9d974a4f8b25/go.mod h1:360KoNl36ftFYhjLHuEty78kWUGw8i1opEicvIDLfRk=
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w=
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
github.com/go-shiori/go-epub v1.2.0 h1:c2b3DblHpNIiD8ISlQ+0Mc/tsRmn1mX1l6Q/0LzavN4=
github.com/go-shiori/go-epub v1.2.0/go.mod h1:gQCqrK+dIMLA7JMd8GxdBvhn811wb7XCa733RxWfPYw=
github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad h1:3VP5Q8Mh165h2DHmXWFT4LJlwwvgTRlEuoe2vnsVnJ4=
github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad/go.mod h1:2DpZlTJO/ycxp/vsc/C11oUyveStOgIXB88SYV1lncI=
github.com/go-shiori/warc v0.0.0-20200621032813-359908319d1d h1:+SEf4hYDaAt2eyq8Xu3YyWCpnMsK8sZfbYsDRFCUgBM=
Expand Down Expand Up @@ -258,6 +260,8 @@ github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLY
github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/vincent-petithory/dataurl v1.0.0 h1:cXw+kPto8NLuJtlMsI152irrVw9fRDX8AbShPRpg2CI=
github.com/vincent-petithory/dataurl v1.0.0/go.mod h1:FHafX5vmDzyP+1CQATJn7WFKc9CvnvxyvZy6I1MrG/U=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ=
Expand Down
218 changes: 19 additions & 199 deletions internal/core/ebook.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
package core

import (
"archive/zip"
"fmt"
"io"
"log"
"net/http"
"os"
fp "path/filepath"
"regexp"
"strconv"
"strings"

epub "github.com/go-shiori/go-epub"
"github.com/go-shiori/shiori/internal/model"
"github.com/pkg/errors"
)
Expand All @@ -20,8 +16,6 @@ import (
// The destination path `dstPath` should include file name with ".epub" extension
// The bookmark model will be used to update the UI based on whether this function is successful or not.
func GenerateEbook(req ProcessRequest, dstPath string) (book model.Bookmark, err error) {
// variable for store generated html code
var html string

book = req.Bookmark

Expand All @@ -30,8 +24,7 @@ func GenerateEbook(req ProcessRequest, dstPath string) (book model.Bookmark, err
return book, errors.New("bookmark ID is not valid")
}

// get current state of bookmark
// cheak archive and thumb
// Get the current state of the bookmark: check for an archive and a thumbnail
strID := strconv.Itoa(book.ID)

imagePath := fp.Join(req.DataDir, "thumb", fmt.Sprintf("%d", book.ID))
Expand All @@ -45,192 +38,45 @@ func GenerateEbook(req ProcessRequest, dstPath string) (book model.Bookmark, err
book.HasArchive = true
}

// this function create ebook from reader mode of bookmark so
// This function creates the ebook from the reader mode of the bookmark, so
// we can't create an ebook from a PDF; return an error here if the bookmark is a PDF
contentType := req.ContentType
if strings.Contains(contentType, "application/pdf") {
return book, errors.New("can't create ebook for pdf")
}

// create temporary epub file
// Create temporary epub file
tmpFile, err := os.CreateTemp("", "ebook")
if err != nil {
return book, errors.Wrap(err, "can't create temporary EPUB file")
}
defer os.Remove(tmpFile.Name())

// Create zip archive
epubWriter := zip.NewWriter(tmpFile)
// Create last line of ebook
lastline := `<hr/><p style="text-align:center">Generated By <a href="https://github.com/go-shiori/shiori">Shiori</a> From <a href="` + book.URL + `">This Page</a></p>`

// Create the mimetype file
mimetypeWriter, err := epubWriter.Create("mimetype")
// Create ebook
ebook, err := epub.NewEpub(book.Title)
if err != nil {
return book, errors.Wrap(err, "can't create mimetype")
}
_, err = mimetypeWriter.Write([]byte("application/epub+zip"))
if err != nil {
return book, errors.Wrap(err, "can't write into mimetype file")
}

// Create the container.xml file
containerWriter, err := epubWriter.Create("META-INF/container.xml")
if err != nil {
return book, errors.Wrap(err, "can't create container.xml")
}

_, err = containerWriter.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>`))
if err != nil {
return book, errors.Wrap(err, "can't write into container.xml file")
}

contentOpfWriter, err := epubWriter.Create("OEBPS/content.opf")
if err != nil {
return book, errors.Wrap(err, "can't create content.opf")
}
_, err = contentOpfWriter.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="BookId">
<metadata>
<dc:title>` + book.Title + `</dc:title>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="content" href="content.html" media-type="application/xhtml+xml"/>
<item id="id" href="../style.css" media-type="text/css"/>
</manifest>
<spine toc="ncx">
<itemref idref="content"/>
</spine>
</package>`))
if err != nil {
return book, errors.Wrap(err, "can't write into container.opf file")
return book, errors.Wrap(err, "can't create EPUB")
}

// Create the style.css file
styleWriter, err := epubWriter.Create("style.css")
if err != nil {
return book, errors.Wrap(err, "can't create content.xml")
}
_, err = styleWriter.Write([]byte(`content {
display: block;
font-size: 1em;
line-height: 1.2;
padding-left: 0;
padding-right: 0;
text-align: justify;
margin: 0 5pt
}
img {
margin: auto;
display: block;
}`))
ebook.SetTitle(book.Title)
ebook.SetAuthor(book.Author)
ebook.SetDescription(book.Excerpt)
_, err = ebook.AddSection(`<h1 style="text-align:center"> `+book.Title+` </h1>`+book.HTML+lastline, book.Title, "", "")
if err != nil {
return book, errors.Wrap(err, "can't write into style.css file")
return book, errors.Wrap(err, "can't add ebook Section")
}
// Create the toc.ncx file
tocNcxWriter, err := epubWriter.Create("OEBPS/toc.ncx")
ebook.EmbedImages()
err = ebook.Write(tmpFile.Name())
if err != nil {
return book, errors.Wrap(err, "can't create toc.ncx")
return book, errors.Wrap(err, "can't create ebook file")
}
_, err = tocNcxWriter.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="urn:uuid:12345678-1234-5678-1234-567812345678"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>` + book.Title + `</text>
</docTitle>
<navMap>
<navPoint id="navPoint-1" playOrder="1">
<navLabel>
<text >` + book.Title + `</text>
</navLabel>
<content src="content.html"/>
</navPoint>
</navMap>
</ncx>`))
if err != nil {
return book, errors.Wrap(err, "can't write into toc.ncx file")
}

// get list of images tag in html
imageList, _ := GetImages(book.HTML)
imgRegex := regexp.MustCompile(`<img.*?src="([^"]*)".*?>`)

// Create a set to store unique image URLs
imageSet := make(map[string]bool)

// Download image in html file and generate new html
html = book.HTML
for _, match := range imgRegex.FindAllStringSubmatch(book.HTML, -1) {
imageURL := match[1]
if _, ok := imageList[imageURL]; ok && !imageSet[imageURL] {
// Add the image URL to the set
imageSet[imageURL] = true

// Download the image
resp, err := http.Get(imageURL)
if err != nil {
log.Fatal(err)
}
defer resp.Body.Close()

// Get the image data
imageData, err := io.ReadAll(resp.Body)
if err != nil {
return book, errors.Wrap(err, "can't get image from the internet")
}

fileName := fp.Base(imageURL)
filePath := "images/" + fileName
imageWriter, err := epubWriter.Create(filePath)
if err != nil {
log.Fatal(err)
}

// Write the image to the file
_, err = imageWriter.Write(imageData)
if err != nil {
return book, errors.Wrap(err, "can't create image file")
}
// Replace the image tag with the new downloaded image
html = strings.ReplaceAll(html, match[0], fmt.Sprintf(`<img src="../%s"/>`, filePath))
}
}
// Create the content.html file
contentHtmlWriter, err := epubWriter.Create("OEBPS/content.html")
if err != nil {
return book, errors.Wrap(err, "can't create content.xml")
}
_, err = contentHtmlWriter.Write([]byte("<?xml version='1.0' encoding='utf-8'?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n\t<title>" + book.Title + "</title>\n\t<link href=\"../style.css\" rel=\"stylesheet\" type=\"text/css\"/>\n</head>\n<body>\n\t<h1 dir=\"auto\">" + book.Title + "</h1>" + "\n<content dir=\"auto\">\n" + html + "\n</content>" + "\n</body></html>"))
if err != nil {
return book, errors.Wrap(err, "can't write into content.html")
}
// close epub and tmpFile
err = epubWriter.Close()
if err != nil {
return book, errors.Wrap(err, "failed to close EPUB writer")
}
err = tmpFile.Close()
if err != nil {
return book, errors.Wrap(err, "failed to close temporary EPUB file")
}
// open temporary file again
tmpFile, err = os.Open(tmpFile.Name())
if err != nil {
return book, errors.Wrap(err, "can't open temporary EPUB file")
}
defer tmpFile.Close()
// if everitings go well we start move ebook to dstPath

// If everything goes well, we move the ebook to dstPath
err = MoveFileToDestination(dstPath, tmpFile)
if err != nil {
return book, errors.Wrap(err, "failed move ebook to destination")
Expand All @@ -239,29 +85,3 @@ img {
book.HasEbook = true
return book, nil
}

// imgTagWithSrcRegex matches an <img ...> tag and captures the value of its
// double-quoted src attribute. It is compiled once at package initialisation
// rather than on every GetImages call.
var imgTagWithSrcRegex = regexp.MustCompile(`<img.*?src="(.*?)".*?>`)

// GetImages scans html for <img> tags and returns a map whose keys are the
// image URLs (the src attribute values) and whose values are the full tags in
// which those URLs were found.
//
// Images embedded as data URIs (src starting with "data:image/") are skipped.
// When the same URL occurs in several tags, the last matching tag is the one
// stored. If html contains no matching <img> tag at all the returned map is
// nil; if tags are present but every one of them is a data URI the returned
// map is empty but non-nil. The returned error is always nil.
func GetImages(html string) (map[string]string, error) {
	imageTagMatches := imgTagWithSrcRegex.FindAllStringSubmatch(html, -1)

	// Nothing matched: keep returning a nil map, as callers only read from it.
	if len(imageTagMatches) == 0 {
		return nil, nil
	}

	// At most one entry per match, so the match count is an upper bound.
	images := make(map[string]string, len(imageTagMatches))
	for _, match := range imageTagMatches {
		imageURL := match[1]
		if strings.HasPrefix(imageURL, "data:image/") {
			// Inline images carry their own data; there is no URL to record.
			continue
		}
		images[imageURL] = match[0]
	}

	return images, nil
}
71 changes: 0 additions & 71 deletions internal/core/ebook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,74 +171,3 @@ func TestGenerateEbook(t *testing.T) {
})
})
}

// Add more unit tests for the scenarios that are still missing, especially:
// failing to create the ebook directory and failing to write the file,
// writing inside the zip file,
// the html variable that is not exported, and the image download loop

// TestGetImages feeds core.GetImages a series of HTML documents and verifies,
// for each one, that no error is returned, that the number of extracted
// images is as expected, and that every expected URL maps to its <img> tag.
func TestGetImages(t *testing.T) {
	scenarios := []struct {
		html     string
		expected map[string]string
	}{
		// Scenario 1: HTML with no image tags
		{
			html:     `<html><body><h1>Hello, World!</h1></body></html>`,
			expected: make(map[string]string),
		},
		// Scenario 2: HTML with one image tag
		{
			html:     `<html><body><img src="image1.jpg"></body></html>`,
			expected: map[string]string{"image1.jpg": "<img src=\"image1.jpg\">"},
		},
		// Scenario 3: HTML with multiple image tags
		{
			html: `<html><body><img src="image1.jpg"><img src="image2.jpg"></body></html>`,
			expected: map[string]string{
				"image1.jpg": "<img src=\"image1.jpg\">",
				"image2.jpg": "<img src=\"image2.jpg\">",
			},
		},
		// Scenario 4: HTML with multiple image tags, one of them duplicated
		{
			html: `<html><body><img src="image1.jpg"><img src="image2.jpg"><img src="image2.jpg"></body></html>`,
			expected: map[string]string{
				"image1.jpg": "<img src=\"image1.jpg\">",
				"image2.jpg": "<img src=\"image2.jpg\">",
			},
		},
	}

	for _, sc := range scenarios {
		result, err := core.GetImages(sc.html)
		if err != nil {
			t.Errorf("Unexpected error: %v", err)
		}

		if want, got := len(sc.expected), len(result); got != want {
			t.Errorf("Expected %d images, but got %d", want, got)
		}

		// Ranging over an empty expectation is a no-op, which matches the
		// first scenario where only the error and the count are checked.
		for key, value := range sc.expected {
			if actual := result[key]; actual != value {
				t.Errorf("Expected image URL %s with tag %s, but got %s", key, value, actual)
			}
		}
	}
}
Loading