Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 61 additions & 3 deletions cpp/parser/GumboNormalizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,34 @@ static void emit_one_attr(buffer_t *out, GumboElement *el,
}
}

/*
 * Decides whether a <ul> element should be treated as a checkbox list.
 *
 * A list qualifies when either:
 *   - its own data-type attribute is "checkbox" or "checkboxList", or
 *   - its first <li> child has role="checkbox" (Google Docs) or a class
 *     attribute containing the substring "checklist" (MS Word).
 *
 * Only the first <li> child is inspected; element children with any other
 * tag name are skipped, and later <li> children are never looked at.
 */
static bool is_checkbox_list(GumboElement *el) {
  const char *list_type = get_attr(el, "data-type");
  if (list_type != NULL && (strcmp(list_type, "checkbox") == 0 ||
                            strcmp(list_type, "checkboxList") == 0)) {
    return true;
  }

  GumboVector *kids = &el->children;
  for (unsigned int idx = 0; idx < kids->length; idx++) {
    GumboNode *kid = kids->data[idx];
    if (!is_element(kid)) {
      continue;
    }

    char kid_tag[64];
    if (!get_tag_name(kid, kid_tag, sizeof(kid_tag)) ||
        strcmp(kid_tag, "li") != 0) {
      continue;
    }

    /* First <li> found: it alone decides the answer. */
    GumboElement *li = &kid->v.element;
    const char *li_role = get_attr(li, "role");
    const char *li_class = get_attr(li, "class");

    if (li_role != NULL && strcmp(li_role, "checkbox") == 0) {
      return true; /* Google Docs */
    }
    return li_class != NULL &&
           strstr(li_class, "checklist") != NULL; /* MS Word */
  }

  return false;
}

static void emit_attributes(GumboElement *el, const char *tag_name,
buffer_t *out) {
if (strcmp(tag_name, "a") == 0) {
Expand All @@ -421,12 +449,21 @@ static void emit_attributes(GumboElement *el, const char *tag_name,
emit_one_attr(out, el, "width");
emit_one_attr(out, el, "height");
} else if (strcmp(tag_name, "ul") == 0) {
const char *val = get_attr(el, "data-type");
if (val && strcmp(val, "checkbox") == 0)
if (is_checkbox_list(el)) {
buffer_append_str(out, " data-type=\"checkbox\"");
}
} else if (strcmp(tag_name, "li") == 0) {
if (gumbo_get_attribute(&el->attributes, "checked") != NULL)
const char *data_checked = get_attr(el, "data-checked");
const char *aria_checked = get_attr(el, "aria-checked");
const char *level_text = get_attr(el, "data-leveltext");

// "\xEF\x83\xBE" is the UTF-8 hex encoding for U+F0FE (MS Word Checked Box)
if (gumbo_get_attribute(&el->attributes, "checked") != NULL ||
(data_checked && strcmp(data_checked, "true") == 0) ||
(aria_checked && strcmp(aria_checked, "true") == 0) ||
(level_text && strcmp(level_text, "\xEF\x83\xBE") == 0)) {
buffer_append_str(out, " checked");
}
} else if (strcmp(tag_name, "mention") == 0) {
emit_one_attr(out, el, "id");
emit_one_attr(out, el, "text");
Expand Down Expand Up @@ -511,6 +548,7 @@ typedef struct {
GumboNode **nested_lists;
int *nested_count;
int max_nested;
bool has_emitted;
} li_ctx_t;

static void flatten_li_node(GumboNode *node, buffer_t *ib, buffer_t *out,
Expand All @@ -527,6 +565,7 @@ static void flush_li_buffer(buffer_t *ib, buffer_t *out, li_ctx_t *ctx) {
emit_styles_close(out, ctx->styles);
buffer_append_str(out, "</li>");
buffer_clear(ib);
ctx->has_emitted = true;
}

static void flatten_li_children(GumboNode *node, buffer_t *ib, buffer_t *out,
Expand All @@ -551,6 +590,17 @@ static void flatten_li_node(GumboNode *node, buffer_t *ib, buffer_t *out,
flatten_li_children(node, ib, out, ctx);
return;
}

char buf[64];
const char *tag = get_tag_name(node, buf, sizeof(buf));
if (tag && strcmp(tag, "img") == 0) {
const char *role = get_attr(ctx->el, "role");
// strip the <img> that Google Docs uses for the display of a checkbox icon
if (role && strcmp(role, "checkbox") == 0) {
return;
}
}

if (is_list_node(node)) {
if (*ctx->nested_count < ctx->max_nested) {
ctx->nested_lists[*ctx->nested_count] = node;
Expand Down Expand Up @@ -837,6 +887,14 @@ static void walk_node(GumboNode *node, buffer_t *out) {
li_ctx_t ctx = {el, es, nested_lists, &nested_count, 16};
flatten_li_children(node, &li_ib, out, &ctx);
flush_li_buffer(&li_ib, out, &ctx);

/* if nothing emitted - the <li> is empty, we add it manually */
if (!ctx.has_emitted) {
buffer_append_str(out, "<li");
emit_attributes(el, "li", out);
buffer_append_str(out, "></li>");
}

free(li_ib.data);
for (int k = 0; k < nested_count; k++)
walk_children(nested_lists[k], out);
Expand Down
54 changes: 54 additions & 0 deletions cpp/tests/GumboParserTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,60 @@ TEST(GumboParserTest, ListFlattening) {
"<ul><li><b>another one </b>hi kacper,</li><li>hi</li></ul>");
}

// Tiptap's checkbox-list markup (data-type="checkboxList" on the <ul>,
// data-checked on each <li>, plus <label>/<input>/<div>/<p> wrappers) is
// expected to normalize to a flat checkbox list.
TEST(GumboParserTest, TiptapCheckboxList) {
  const char *input =
      "<ul data-type=\"checkboxList\">"
      "<li data-checked=\"true\" data-type=\"checkboxItem\">"
      "<label><input type=\"checkbox\" checked=\"checked\"><span></span></label>"
      "<div><p>first</p></div></li>"
      "<li data-checked=\"false\" data-type=\"checkboxItem\">"
      "<label><input type=\"checkbox\"><span></span></label>"
      "<div><p>second</p></div></li>"
      "</ul>";
  const char *expected =
      "<ul data-type=\"checkbox\">"
      "<li checked>first</li>"
      "<li>second</li>"
      "</ul>";

  EXPECT_EQ(GumboParser::normalizeHtml(input), expected);
}

// Google Docs marks checklist items with role="checkbox" / aria-checked and
// puts an <img> inside each item; the expected output is a plain checkbox
// list without those images.
TEST(GumboParserTest, GoogleDocsCheckboxList) {
  const char *input =
      "<ul>"
      "<li role=\"checkbox\" aria-checked=\"true\">"
      "<img src=\"data:...\" /><p>Checked</p></li>"
      "<li role=\"checkbox\" aria-checked=\"false\">"
      "<img src=\"data:...\" /><p>Unchecked</p></li>"
      "</ul>";
  const char *expected =
      "<ul data-type=\"checkbox\">"
      "<li checked>Checked</li>"
      "<li>Unchecked</li>"
      "</ul>";

  EXPECT_EQ(GumboParser::normalizeHtml(input), expected);
}

// MS Word marks checklist items with a "checklist" class and encodes the box
// state in data-leveltext:
//   "\xEF\x83\xBE" = UTF-8 bytes of U+F0FE (checked box)
//   "\xEF\x82\xA8" = UTF-8 bytes of U+F0A8 (unchecked box)
TEST(GumboParserTest, MSWordCheckboxList) {
  const char *input =
      "<ul>"
      "<li class=\"OutlineElement checklist\" "
      "data-leveltext=\"\xEF\x83\xBE\">Checked</li>"
      "<li class=\"OutlineElement checklist\" "
      "data-leveltext=\"\xEF\x82\xA8\">Unchecked</li>"
      "</ul>";
  const char *expected =
      "<ul data-type=\"checkbox\">"
      "<li checked>Checked</li>"
      "<li>Unchecked</li>"
      "</ul>";

  EXPECT_EQ(GumboParser::normalizeHtml(input), expected);
}

// Empty <li> elements must be kept. Each fixture below is expected to come
// back from the normalizer unchanged, so it serves as both input and
// expected output.
TEST(GumboParserTest, EmptyListItems) {
  const char *const unchanged_fixtures[] = {
      // unordered list
      "<ul><li></li><li>first</li><li></li><li>second</li><li></li><li></li>"
      "</ul>",
      // ordered list
      "<ol><li></li><li>first</li><li></li><li>second</li><li></li><li></li>"
      "</ol>",
      // checkbox list, including empty checked items
      "<ul data-type=\"checkbox\"><li checked></li><li>first</li><li></li>"
      "<li checked>second</li><li></li><li></li></ul>",
  };

  for (const char *html : unchanged_fixtures) {
    EXPECT_EQ(GumboParser::normalizeHtml(html), html);
  }
}

TEST(GumboParserTest, BrRemappings) {
EXPECT_EQ(GumboParser::normalizeHtml(
"<p><b>Asdasdasd</b></p><br><br><p>Sent with<span> </span><a "
Expand Down
50 changes: 50 additions & 0 deletions src/web/__tests__/htmlNormalizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,56 @@ describe('htmlNormalizer', () => {
});
});

describe('EmptyListItems', () => {
  // Each fixture is expected to pass through the normalizer unchanged, so the
  // same string is used as input and expected output. The point is that empty
  // <li> elements are kept rather than dropped.
  const unchangedFixtures = [
    '<ul><li></li><li>first</li><li></li><li>second</li><li></li><li></li></ul>',
    '<ol><li></li><li>first</li><li></li><li>second</li><li></li><li></li></ol>',
    '<ul data-type="checkbox"><li checked></li><li>first</li><li></li><li checked>second</li><li></li><li></li></ul>',
  ];
  const cases = unchangedFixtures.map((html): [string, string] => [html, html]);

  test.each(cases)('%s → %s', (input, expected) => {
    expect(normalizeHtml(input)).toBe(expected);
  });
});

describe('TiptapCheckboxList', () => {
  test("tiptap's internal checkbox list structure gets correctly parsed", () => {
    // Tiptap wraps each item's text in <div><p>…</p></div> and prepends a
    // <label> holding the actual <input type="checkbox">.
    const checkedItem =
      '<li data-checked="true" data-type="checkboxItem">' +
      '<label><input type="checkbox" checked="checked"><span></span></label>' +
      '<div><p>first</p></div></li>';
    const uncheckedItem =
      '<li data-checked="false" data-type="checkboxItem">' +
      '<label><input type="checkbox"><span></span></label>' +
      '<div><p>second</p></div></li>';
    const input = [
      '<ul data-type="checkboxList">',
      checkedItem,
      uncheckedItem,
      '</ul>',
    ].join('');

    const expected =
      '<ul data-type="checkbox"><li checked>first</li><li>second</li></ul>';

    expect(normalizeHtml(input)).toBe(expected);
  });
});

describe('Checkbox Lists (Google Docs & MS Word)', () => {
  // Both source formats are expected to normalize to the same markup.
  const normalized =
    '<ul data-type="checkbox"><li checked>Checked</li><li>Unchecked</li></ul>';

  // Google Docs format: role="checkbox" + aria-checked, with an <img> per item.
  const googleDocsHtml =
    '<ul><li role="checkbox" aria-checked="true"><img src="data:image/png;base64,..." /><p>Checked</p></li><li role="checkbox" aria-checked="false"><img src="data:image/png;base64,..." /><p>Unchecked</p></li></ul>';

  // MS Word format: "checklist" class, box state in data-leveltext
  // (U+F0FE = checked, U+F0A8 = unchecked).
  const msWordHtml =
    '<ul><li class="OutlineElement checklist" data-leveltext="\uF0FE">Checked</li><li class="OutlineElement checklist" data-leveltext="\uF0A8">Unchecked</li></ul>';

  const cases: [string, string][] = [
    [googleDocsHtml, normalized],
    [msWordHtml, normalized],
  ];

  test.each(cases)('%s → %s', (input, expected) => {
    expect(normalizeHtml(input)).toBe(expected);
  });
});

describe('BrRemappings', () => {
test('inline collapses around <br> stay flat', () => {
expect(
Expand Down
59 changes: 53 additions & 6 deletions src/web/normalization/htmlNormalizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -249,12 +249,16 @@ function emitAttributes(el: Element, name: string): string {
emitOneAttr(el, 'width') +
emitOneAttr(el, 'height')
);
case 'ul': {
const val = el.getAttribute('data-type');
return val === 'checkbox' ? ' data-type="checkbox"' : '';
}
case 'ul':
return isCheckboxList(el) ? ' data-type="checkbox"' : '';
case 'li':
return el.hasAttribute('checked') ? ' checked' : '';
// "\uF0FE" is U+F0FE (MS Word checked box); its UTF-8 encoding is the byte sequence EF 83 BE ("\xEF\x83\xBE").
const isChecked =
el.hasAttribute('checked') ||
el.getAttribute('data-checked') === 'true' ||
el.getAttribute('aria-checked') === 'true' ||
el.getAttribute('data-leveltext') === ''; // MS Word checked box
return isChecked ? ' checked' : '';
case 'mention':
return (
emitOneAttr(el, 'id') +
Expand All @@ -266,6 +270,32 @@ function emitAttributes(el: Element, name: string): string {
}
}

/**
 * Decides whether a `<ul>` element should be treated as a checkbox list.
 *
 * A list qualifies when its own `data-type` is `checkbox` or `checkboxList`,
 * or when its first `<li>` child carries `role="checkbox"` (Google Docs) or a
 * `class` attribute containing the substring `checklist` (MS Word).
 *
 * Only the first `<li>` child is inspected; children with other tag names are
 * skipped and later `<li>` children are never looked at.
 *
 * @param el - The list element to inspect.
 * @returns `true` when the list is recognized as a checkbox list.
 */
function isCheckboxList(el: Element): boolean {
  const listType = el.getAttribute('data-type');
  if (listType === 'checkbox' || listType === 'checkboxList') {
    return true;
  }

  for (const child of Array.from(el.children)) {
    if (child.tagName.toLowerCase() !== 'li') continue;

    // First <li> found: it alone decides the answer.
    if (child.getAttribute('role') === 'checkbox') {
      return true; // Google Docs
    }
    const cls = child.getAttribute('class');
    return cls !== null && cls.includes('checklist'); // MS Word
  }

  return false;
}

function isGoogleDocsWrapper(el: Element, tag: string): boolean {
if (tag !== 'b') return false;
const id = el.getAttribute('id');
Expand Down Expand Up @@ -340,6 +370,7 @@ type LiCtx = {
el: Element;
styles: CssStyles;
nestedLists: Element[];
hasEmitted: boolean;
};

function flushLiBuffer(
Expand All @@ -354,6 +385,7 @@ function flushLiBuffer(
out.buf += emitStylesClose(ctx.styles);
out.buf += '</li>';
ib.buf = '';
ctx.hasEmitted = true;
}

function flattenLiChildren(
Expand All @@ -378,6 +410,15 @@ function flattenLiNode(
return;
}
if (!isElement(node)) return;

if (tagName(node) === 'img') {
const role = ctx.el.getAttribute('role');
// strip the <img> that Google Docs uses for the display of a checkbox icon
if (role === 'checkbox') {
return;
}
}
Comment thread
hejsztynx marked this conversation as resolved.

if (isListNode(node)) {
ctx.nestedLists.push(node);
return;
Expand Down Expand Up @@ -573,9 +614,15 @@ function walkNode(node: Node, out: { buf: string }): void {
if (outName === 'li') {
const nestedLists: Element[] = [];
const liIb = { buf: '' };
const ctx: LiCtx = { el: node, styles: es, nestedLists };
const ctx: LiCtx = { el: node, styles: es, nestedLists, hasEmitted: false };
flattenLiChildren(node, liIb, out, ctx);
flushLiBuffer(liIb, out, ctx);

// if nothing emitted - the <li> is empty, we add it manually
if (!ctx.hasEmitted) {
out.buf += `<li${emitAttributes(ctx.el, 'li')}></li>`;
}

for (const nl of nestedLists) walkChildren(nl, out);
return;
}
Expand Down
Loading