diff --git a/cpp/parser/GumboNormalizer.c b/cpp/parser/GumboNormalizer.c index c4ece832b..c9bea6b27 100644 --- a/cpp/parser/GumboNormalizer.c +++ b/cpp/parser/GumboNormalizer.c @@ -411,6 +411,34 @@ static void emit_one_attr(buffer_t *out, GumboElement *el, } } +static bool is_checkbox_list(GumboElement *el) { + const char *val = get_attr(el, "data-type"); + if (val && (strcmp(val, "checkbox") == 0 || strcmp(val, "checkboxList") == 0)) { + return true; + } + + // In Google Docs and MS Word the
  • elements define if it is a checkbox + // list. We only need to check the first
  • . + GumboVector *children = &el->children; + for (unsigned int i = 0; i < children->length; i++) { + GumboNode *child = children->data[i]; + if (is_element(child)) { + char child_tag[64]; + if (get_tag_name(child, child_tag, sizeof(child_tag)) && strcmp(child_tag, "li") == 0) { + GumboElement *child_el = &child->v.element; + const char *role = get_attr(child_el, "role"); + const char *cls = get_attr(child_el, "class"); + + // Matches Google Docs (role="checkbox") OR MS Word (class includes "checklist") + return (role && strcmp(role, "checkbox") == 0) || + (cls && strstr(cls, "checklist") != NULL); + } + } + } + + return false; +} + static void emit_attributes(GumboElement *el, const char *tag_name, buffer_t *out) { if (strcmp(tag_name, "a") == 0) { @@ -421,12 +449,21 @@ static void emit_attributes(GumboElement *el, const char *tag_name, emit_one_attr(out, el, "width"); emit_one_attr(out, el, "height"); } else if (strcmp(tag_name, "ul") == 0) { - const char *val = get_attr(el, "data-type"); - if (val && strcmp(val, "checkbox") == 0) + if (is_checkbox_list(el)) { buffer_append_str(out, " data-type=\"checkbox\""); + } } else if (strcmp(tag_name, "li") == 0) { - if (gumbo_get_attribute(&el->attributes, "checked") != NULL) + const char *data_checked = get_attr(el, "data-checked"); + const char *aria_checked = get_attr(el, "aria-checked"); + const char *level_text = get_attr(el, "data-leveltext"); + + // "\xEF\x83\xBE" is the UTF-8 hex encoding for U+F0FE (MS Word Checked Box) + if (gumbo_get_attribute(&el->attributes, "checked") != NULL || + (data_checked && strcmp(data_checked, "true") == 0) || + (aria_checked && strcmp(aria_checked, "true") == 0) || + (level_text && strcmp(level_text, "\xEF\x83\xBE") == 0)) { buffer_append_str(out, " checked"); + } } else if (strcmp(tag_name, "mention") == 0) { emit_one_attr(out, el, "id"); emit_one_attr(out, el, "text"); @@ -511,6 +548,7 @@ typedef struct { GumboNode **nested_lists; int *nested_count; int max_nested; + bool has_emitted; } li_ctx_t; static void flatten_li_node(GumboNode *node, buffer_t *ib, buffer_t *out, @@ -527,6 +565,7 @@ static void flush_li_buffer(buffer_t *ib, buffer_t *out, li_ctx_t *ctx) { emit_styles_close(out, ctx->styles); buffer_append_str(out, "
  • "); buffer_clear(ib); + ctx->has_emitted = true; } static void flatten_li_children(GumboNode *node, buffer_t *ib, buffer_t *out, @@ -551,6 +590,17 @@ static void flatten_li_node(GumboNode *node, buffer_t *ib, buffer_t *out, flatten_li_children(node, ib, out, ctx); return; } + + char buf[64]; + const char *tag = get_tag_name(node, buf, sizeof(buf)); + if (tag && strcmp(tag, "img") == 0) { + const char *role = get_attr(ctx->el, "role"); + // strip the that Google Docs uses for the display of a checkbox icon + if (role && strcmp(role, "checkbox") == 0) { + return; + } + } + if (is_list_node(node)) { if (*ctx->nested_count < ctx->max_nested) { ctx->nested_lists[*ctx->nested_count] = node; @@ -837,6 +887,14 @@ static void walk_node(GumboNode *node, buffer_t *out) { li_ctx_t ctx = {el, es, nested_lists, &nested_count, 16}; flatten_li_children(node, &li_ib, out, &ctx); flush_li_buffer(&li_ib, out, &ctx); + + /* if nothing emitted - the
  • is empty, we add it manually */ + if (!ctx.has_emitted) { + buffer_append_str(out, "
  • "); + } + free(li_ib.data); for (int k = 0; k < nested_count; k++) walk_children(nested_lists[k], out); diff --git a/cpp/tests/GumboParserTest.cpp b/cpp/tests/GumboParserTest.cpp index f45b7ec93..124e44642 100644 --- a/cpp/tests/GumboParserTest.cpp +++ b/cpp/tests/GumboParserTest.cpp @@ -448,6 +448,60 @@ TEST(GumboParserTest, ListFlattening) { ""); } +TEST(GumboParserTest, TiptapCheckboxList) { + EXPECT_EQ( + GumboParser::normalizeHtml( + ""), + ""); +} + +TEST(GumboParserTest, GoogleDocsCheckboxList) { + EXPECT_EQ(GumboParser::normalizeHtml( + ""), + ""); +} + +TEST(GumboParserTest, MSWordCheckboxList) { + // \xEF\x83\xBE is the UTF-8 hex for U+F0FE (Checked MS Word box) + // \xEF\x82\xA8 is the UTF-8 hex for U+F0A8 (Unchecked MS Word box) + EXPECT_EQ( + GumboParser::normalizeHtml( + ""), + ""); +} + +TEST(GumboParserTest, EmptyListItems) { + EXPECT_EQ(GumboParser::normalizeHtml(""), + ""); + EXPECT_EQ(GumboParser::normalizeHtml("
    1. first
    2. second
    3. " + "
    "), + "
    1. first
    2. second
    "); + EXPECT_EQ( + GumboParser::normalizeHtml( + ""), + ""); +} + TEST(GumboParserTest, BrRemappings) { EXPECT_EQ(GumboParser::normalizeHtml( "

    Asdasdasd



    Sent with { }); }); + describe('EmptyListItems', () => { + test.each([ + [ + '

    ', + '', + ], + [ + '
    1. first
    2. second
    ', + '
    1. first
    2. second
    ', + ], + [ + '', + '', + ], + ])('%s → %s', (input, expected) => { + expect(normalizeHtml(input)).toBe(expected); + }); + }); + + describe('TiptapCheckboxList', () => { + test("tiptap's internal checkbox list structure gets correctly parsed", () => { + expect( + normalizeHtml( + `` + ) + ).toBe( + '' + ); + }); + }); + + describe('Checkbox Lists (Google Docs & MS Word)', () => { + test.each([ + // Google Docs format + [ + '', + '', + ], + // MS Word format + [ + '', + '', + ], + ])('%s → %s', (input, expected) => { + expect(normalizeHtml(input)).toBe(expected); + }); + }); + describe('BrRemappings', () => { test('inline collapses around
    stay flat', () => { expect( diff --git a/src/web/normalization/htmlNormalizer.ts b/src/web/normalization/htmlNormalizer.ts index be34e44f9..670f40959 100644 --- a/src/web/normalization/htmlNormalizer.ts +++ b/src/web/normalization/htmlNormalizer.ts @@ -249,12 +249,16 @@ function emitAttributes(el: Element, name: string): string { emitOneAttr(el, 'width') + emitOneAttr(el, 'height') ); - case 'ul': { - const val = el.getAttribute('data-type'); - return val === 'checkbox' ? ' data-type="checkbox"' : ''; - } + case 'ul': + return isCheckboxList(el) ? ' data-type="checkbox"' : ''; case 'li': - return el.hasAttribute('checked') ? ' checked' : ''; + // "" is U+F0FE (MS Word checked box); often encoded as "\xEF\x83\xBE" in UTF-8. + const isChecked = + el.hasAttribute('checked') || + el.getAttribute('data-checked') === 'true' || + el.getAttribute('aria-checked') === 'true' || + el.getAttribute('data-leveltext') === ''; // MS Word checked box + return isChecked ? ' checked' : ''; case 'mention': return ( emitOneAttr(el, 'id') + @@ -266,6 +270,32 @@ function emitAttributes(el: Element, name: string): string { } } +function isCheckboxList(el: Element): boolean { + if ( + el.getAttribute('data-type') === 'checkbox' || + el.getAttribute('data-type') === 'checkboxList' + ) { + return true; + } + + // In Google Docs and MS Word the
  • elements define if it is a checkbox + // list. We only need to check the first
  • . + const firstLi = Array.from(el.children).find( + (c) => c.tagName.toLowerCase() === 'li' + ); + if (firstLi) { + const role = firstLi.getAttribute('role'); + const className = firstLi.getAttribute('class') || ''; + + // Matches Google Docs (role="checkbox") OR MS Word (class includes "checklist") + if (role === 'checkbox' || className.includes('checklist')) { + return true; + } + } + + return false; +} + function isGoogleDocsWrapper(el: Element, tag: string): boolean { if (tag !== 'b') return false; const id = el.getAttribute('id'); @@ -340,6 +370,7 @@ type LiCtx = { el: Element; styles: CssStyles; nestedLists: Element[]; + hasEmitted: boolean; }; function flushLiBuffer( @@ -354,6 +385,7 @@ function flushLiBuffer( out.buf += emitStylesClose(ctx.styles); out.buf += '
  • '; ib.buf = ''; + ctx.hasEmitted = true; } function flattenLiChildren( @@ -378,6 +410,15 @@ function flattenLiNode( return; } if (!isElement(node)) return; + + if (tagName(node) === 'img') { + const role = ctx.el.getAttribute('role'); + // strip the that Google Docs uses for the display of a checkbox icon + if (role === 'checkbox') { + return; + } + } + if (isListNode(node)) { ctx.nestedLists.push(node); return; @@ -573,9 +614,15 @@ function walkNode(node: Node, out: { buf: string }): void { if (outName === 'li') { const nestedLists: Element[] = []; const liIb = { buf: '' }; - const ctx: LiCtx = { el: node, styles: es, nestedLists }; + const ctx: LiCtx = { el: node, styles: es, nestedLists, hasEmitted: false }; flattenLiChildren(node, liIb, out, ctx); flushLiBuffer(liIb, out, ctx); + + // if nothing emitted - the
  • is empty, we add it manually + if (!ctx.hasEmitted) { + out.buf += `
  • `; + } + for (const nl of nestedLists) walkChildren(nl, out); return; }