Skip to content

Commit 620f11b

Browse files
authored
Fix XHTML sniffing and add a sniffing round with only media types hints (#167)
1 parent f8d35a2 commit 620f11b

File tree

5 files changed

+53
-18
lines changed

5 files changed

+53
-18
lines changed

readium/shared/r2-shared/src/main/java/org/readium/r2/shared/util/mediatype/MediaType.kt

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -446,18 +446,31 @@ class MediaType(
446446
* - Heavy Sniffing reads the bytes to perform more advanced sniffing.
447447
*/
448448
private suspend fun of(content: SnifferContent?, mediaTypes: List<String>, fileExtensions: List<String>, sniffers: List<Sniffer>): MediaType? {
449-
// Light sniffing
450-
var context = SnifferContext(mediaTypes = mediaTypes, fileExtensions = fileExtensions)
451-
for (sniffer in sniffers) {
452-
val mediaType = sniffer(context)
453-
if (mediaType != null) {
454-
return mediaType
449+
// Light sniffing with only media type hints
450+
if (mediaTypes.isNotEmpty()) {
451+
val context = SnifferContext(mediaTypes = mediaTypes)
452+
for (sniffer in sniffers) {
453+
val mediaType = sniffer(context)
454+
if (mediaType != null) {
455+
return mediaType
456+
}
457+
}
458+
}
459+
460+
// Light sniffing with both media type hints and file extensions
461+
if (fileExtensions.isNotEmpty()) {
462+
val context = SnifferContext(mediaTypes = mediaTypes, fileExtensions = fileExtensions)
463+
for (sniffer in sniffers) {
464+
val mediaType = sniffer(context)
465+
if (mediaType != null) {
466+
return mediaType
467+
}
455468
}
456469
}
457470

458471
// Heavy sniffing
459472
if (content != null) {
460-
context = SnifferContext(content = content, mediaTypes = mediaTypes, fileExtensions = fileExtensions)
473+
val context = SnifferContext(content = content, mediaTypes = mediaTypes, fileExtensions = fileExtensions)
461474
for (sniffer in sniffers) {
462475
val mediaType = sniffer(context)
463476
if (mediaType != null) {
@@ -470,6 +483,7 @@ class MediaType(
470483
// Note: This is done after the heavy sniffing of the provided [sniffers], because
471484
// otherwise it will detect JSON, XML or ZIP formats before we have a chance of sniffing
472485
// their content (for example, for RWPM).
486+
val context = SnifferContext(content = content, mediaTypes = mediaTypes, fileExtensions = fileExtensions)
473487
Sniffers.system(context)?.let { return it }
474488

475489
// If nothing else worked, we try to parse the first valid media type hint.

readium/shared/r2-shared/src/main/java/org/readium/r2/shared/util/mediatype/Sniffer.kt

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,30 @@ object Sniffers {
3333
* The sniffers order is important, because some formats are subsets of other formats.
3434
*/
3535
val all: List<Sniffer> = listOf(
36-
::html, ::opds, ::lcpLicense, ::bitmap, ::webpub, ::w3cWPUB, ::epub, ::lpf, ::archive,
36+
::xhtml, ::html, ::opds, ::lcpLicense, ::bitmap, ::webpub, ::w3cWPUB, ::epub, ::lpf, ::archive,
3737
::pdf, ::json
3838
)
3939

40+
/**
41+
* Sniffs an XHTML document.
42+
*
43+
* Must precede the HTML sniffer.
44+
*/
45+
suspend fun xhtml(context: SnifferContext): MediaType? {
46+
if (context.hasFileExtension("xht", "xhtml") || context.hasMediaType("application/xhtml+xml")) {
47+
return MediaType.XHTML
48+
}
49+
context.contentAsXml()?.let {
50+
if (it.name.lowercase(Locale.ROOT) == "html" && it.namespace.lowercase(Locale.ROOT).contains("xhtml")) {
51+
return MediaType.XHTML
52+
}
53+
}
54+
return null
55+
}
56+
4057
/** Sniffs an HTML document. */
4158
suspend fun html(context: SnifferContext): MediaType? {
42-
if (context.hasFileExtension("htm", "html", "xht", "xhtml") || context.hasMediaType("text/html", "application/xhtml+xml")) {
59+
if (context.hasFileExtension("htm", "html") || context.hasMediaType("text/html")) {
4360
return MediaType.HTML
4461
}
4562
// [contentAsXml] will fail if the HTML is not a proper XML document, hence the doctype check.

readium/shared/r2-shared/src/main/java/org/readium/r2/shared/util/mediatype/SnifferContext.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ import java.util.*
3232
*/
3333
class SnifferContext internal constructor(
3434
private val content: SnifferContent? = null,
35-
mediaTypes: List<String>,
36-
fileExtensions: List<String>
35+
mediaTypes: List<String> = emptyList(),
36+
fileExtensions: List<String> = emptyList()
3737
) {
3838

3939
/** Media type hints. */

readium/shared/r2-shared/src/test/java/org/readium/r2/shared/fetcher/ArchiveFetcherTest.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,13 @@ class ArchiveFetcherTest {
6060
assertEquals(
6161
listOf(
6262
createLink("/mimetype", null),
63-
createLink("/EPUB/cover.xhtml" , "text/html", 259L),
63+
createLink("/EPUB/cover.xhtml" , "application/xhtml+xml", 259L),
6464
createLink("/EPUB/css/epub.css", "text/css", 595L),
6565
createLink("/EPUB/css/nav.css", "text/css", 306L),
6666
createLink("/EPUB/images/cover.png", "image/png", 35809L),
67-
createLink("/EPUB/nav.xhtml", "text/html", 2293L),
67+
createLink("/EPUB/nav.xhtml", "application/xhtml+xml", 2293L),
6868
createLink("/EPUB/package.opf", null, 773L),
69-
createLink("/EPUB/s04.xhtml", "text/html", 118269L),
69+
createLink("/EPUB/s04.xhtml", "application/xhtml+xml", 118269L),
7070
createLink("/EPUB/toc.ncx", null, 1697),
7171
createLink("/META-INF/container.xml", "text/xml", 176)
7272
),

readium/shared/r2-shared/src/test/java/org/readium/r2/shared/util/mediatype/SnifferTest.kt

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,13 +133,17 @@ class SnifferTest {
133133
fun `sniff HTML`() = runBlocking {
134134
assertEquals(MediaType.HTML, MediaType.of(fileExtension = "htm"))
135135
assertEquals(MediaType.HTML, MediaType.of(fileExtension = "html"))
136-
assertEquals(MediaType.HTML, MediaType.of(fileExtension = "xht"))
137-
assertEquals(MediaType.HTML, MediaType.of(fileExtension = "xhtml"))
138136
assertEquals(MediaType.HTML, MediaType.of(mediaType = "text/html"))
139-
assertEquals(MediaType.HTML, MediaType.of(mediaType = "application/xhtml+xml"))
140137
assertEquals(MediaType.HTML, MediaType.ofFile(fixtures.fileAt("html.unknown")))
141138
assertEquals(MediaType.HTML, MediaType.ofFile(fixtures.fileAt("html-doctype-case.unknown")))
142-
assertEquals(MediaType.HTML, MediaType.ofFile(fixtures.fileAt("xhtml.unknown")))
139+
}
140+
141+
@Test
142+
fun `sniff XHTML`() = runBlocking {
143+
assertEquals(MediaType.XHTML, MediaType.of(fileExtension = "xht"))
144+
assertEquals(MediaType.XHTML, MediaType.of(fileExtension = "xhtml"))
145+
assertEquals(MediaType.XHTML, MediaType.of(mediaType = "application/xhtml+xml"))
146+
assertEquals(MediaType.XHTML, MediaType.ofFile(fixtures.fileAt("xhtml.unknown")))
143147
}
144148

145149
@Test

0 commit comments

Comments
 (0)