Skip to content

Commit

Permalink
fix: normalize extracted video poster paths
Browse files Browse the repository at this point in the history
Fixes #414
  • Loading branch information
harlan-zw committed Jan 28, 2025
1 parent e022206 commit ed18278
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 15 deletions.
6 changes: 6 additions & 0 deletions src/prerender.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import chalk from 'chalk'
import { dirname } from 'pathe'
import { defu } from 'defu'
import type { ConsolaInstance } from 'consola'
import { withSiteUrl } from 'nuxt-site-config/kit'
import { extractSitemapMetaFromHtml } from './util/extractSitemapMetaFromHtml'
import type { ModuleRuntimeConfig, SitemapUrl } from './runtime/types'
import { splitForLocales } from './runtime/utils-pure'
Expand Down Expand Up @@ -80,12 +81,17 @@ export function setupPrerenderHandler(_options: { runtimeConfig: ModuleRuntimeCo
route._sitemap._sitemap = _sitemap
}
}

route._sitemap = defu(extractSitemapMetaFromHtml(html, {
images: options.discoverImages,
videos: options.discoverVideos,
// TODO: should `lastmod` and `alternatives` extraction be configurable instead of always enabled?
lastmod: true,
alternatives: true,
resolveUrl(s) {
// root-relative matches (leading "/") are made absolute with the site URL; anything else is returned unchanged
return s.startsWith('/') ? withSiteUrl(s) : s
},
}), route._sitemap) as SitemapUrl
})
nitro.hooks.hook('prerender:done', async () => {
Expand Down
24 changes: 9 additions & 15 deletions src/util/extractSitemapMetaFromHtml.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import { withSiteUrl } from 'nuxt-site-config/kit'
import { parseURL } from 'ufo'
import { tryUseNuxt } from '@nuxt/kit'
import type { ResolvedSitemapUrl, SitemapUrl, VideoEntry } from '../runtime/types'

const videoRegex = /<video[^>]*>([\s\S]*?)<\/video>/g
Expand All @@ -20,13 +18,14 @@ const videoLiveRegex = /<video[^>]*\sdata-live="([^"]+)"/
const videoTagRegex = /<video[^>]*\sdata-tag="([^"]+)"/
const sourceRegex = /<source[^>]*\ssrc="([^"]+)"/g

export function extractSitemapMetaFromHtml(html: string, options?: { images?: boolean, videos?: boolean, lastmod?: boolean, alternatives?: boolean }) {
export function extractSitemapMetaFromHtml(html: string, options?: { images?: boolean, videos?: boolean, lastmod?: boolean, alternatives?: boolean, resolveUrl?: (s: string) => string }) {
options = options || { images: true, videos: true, lastmod: true, alternatives: true }
const payload: Partial<SitemapUrl> = {}
const resolveUrl = options?.resolveUrl || ((s: string) => s)
const mainRegex = /<main[^>]*>([\s\S]*?)<\/main>/
const mainMatch = mainRegex.exec(html)
if (options?.images) {
const images = new Set<string>()
const mainRegex = /<main[^>]*>([\s\S]*?)<\/main>/
const mainMatch = mainRegex.exec(html)
if (mainMatch?.[1] && mainMatch[1].includes('<img')) {
// Extract image src attributes using regex on the HTML, but ignore elements with invalid values such as data:, blob:, or file:
// eslint-disable-next-line regexp/no-useless-lazy,regexp/no-super-linear-backtracking
Expand All @@ -37,10 +36,7 @@ export function extractSitemapMetaFromHtml(html: string, options?: { images?: bo
// This is necessary to avoid infinite loops with zero-width matches
if (match.index === imgRegex.lastIndex)
imgRegex.lastIndex++
let url = match[1]
// if the match is relative
if (url.startsWith('/'))
url = tryUseNuxt() ? withSiteUrl(url) : url
const url = resolveUrl(match[1])
images.add(url)
}
}
Expand All @@ -50,9 +46,6 @@ export function extractSitemapMetaFromHtml(html: string, options?: { images?: bo

if (options?.videos) {
const videos = []
const mainRegex = /<main[^>]*>([\s\S]*?)<\/main>/
const mainMatch = mainRegex.exec(html)

if (mainMatch?.[1] && mainMatch[1].includes('<video')) {
let videoMatch
while ((videoMatch = videoRegex.exec(mainMatch[1])) !== null) {
Expand Down Expand Up @@ -109,11 +102,12 @@ export function extractSitemapMetaFromHtml(html: string, options?: { images?: bo

if (sources.length > 0) {
videos.push(...sources.map((source) => {
if (source.startsWith('/'))
source = tryUseNuxt() ? withSiteUrl(source) : source
if (videoObj.thumbnail_loc) {
videoObj.thumbnail_loc = resolveUrl(String(videoObj.thumbnail_loc))
}
return {
...videoObj,
content_loc: source,
content_loc: resolveUrl(source),
}
}))
}
Expand Down
36 changes: 36 additions & 0 deletions test/unit/extractSitemapMetaFromHtml.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -309,4 +309,40 @@ describe('extractSitemapMetaFromHtml', () => {
}
`)
})
// Regression test: a root-relative `poster` on a <video> must be passed through the
// caller-supplied `resolveUrl` so the extracted thumbnail is an absolute URL.
it('extracts relative poster as absolute', async () => {
// Fixture: one <video> inside <main> with a root-relative poster ("/poster.jpg"),
// an absolute `src` on the tag itself, and a nested <source> with its own absolute `src`.
const testcase5 = extractSitemapMetaFromHtml(`
<main>
<video
controls
src="https://archive.org/download/BigBuckBunny_124/Content/big_buck_bunny_720p_surround.mp4"
poster="/poster.jpg"
width="620"
data-title="Big Buck Bunny"
data-description="Big Buck Bunny in DivX 720p."
>
<source
src="https://archive.org/download/DuckAndCover_185/CivilDefenseFilm-DuckAndCoverColdWarNuclearPropaganda_512kb.mp4"
type="video/mp4"
/>
</video>
</main>
`, {
// Only video extraction is enabled; images, lastmod and alternatives are not requested.
videos: true,
// Stand-in resolver for the test: prefixes a fixed origin onto root-relative paths
// and returns every other value untouched.
resolveUrl(s) {
return s.startsWith('/') ? `https://example.com${s}` : s
},
})
// Expected: the poster surfaces as an absolute `thumbnail_loc`, and `content_loc` is the
// nested <source> URL (already absolute, so the resolver leaves it as-is).
expect(testcase5).toMatchInlineSnapshot(`
{
"videos": [
{
"content_loc": "https://archive.org/download/DuckAndCover_185/CivilDefenseFilm-DuckAndCoverColdWarNuclearPropaganda_512kb.mp4",
"description": "Big Buck Bunny in DivX 720p.",
"thumbnail_loc": "https://example.com/poster.jpg",
"title": "Big Buck Bunny",
},
],
}
`)
})
})

0 comments on commit ed18278

Please sign in to comment.