diff --git a/src/dompa/coordinates.cljc b/src/dompa/coordinates.cljc index e5ca5e8..2388fcd 100644 --- a/src/dompa/coordinates.cljc +++ b/src/dompa/coordinates.cljc @@ -40,6 +40,13 @@ :start-idx idx :coordinates (conj coordinates [start-idx idx])} + ; new tag starts while we were parsing another tag, + ; handles void elements + (and (= :tag char-type) (= \< c)) + {:char-type :tag + :start-idx idx + :coordinates (conj coordinates [start-idx (dec idx)])} + ; otherwise don't record anything, just note ; the start of a tag (tag-starts? c char-type) @@ -95,62 +102,69 @@ (apply str)) value))) -(defn- name-coordinates-fn - "Returns a function with the initial state of an `html` - string, to be used to construct a sequence of `[index, name]`. " - [html] - (fn [idx coordinate] - [idx (coordinates->tag-name html coordinate)])) - -(defn- last-by-tag-name-idx - "Gets the last coordinate matching the tag `name` that occurred - before `start`, for finding coordinates that should be merged - together." - [html coordinates name start] - (let [filter-fn (fn [[_ end]] (< end start)) - filtered-coordinates (filter filter-fn coordinates) - index-fn (name-coordinates-fn html) - named-coordinates (map-indexed index-fn filtered-coordinates)] - (->> named-coordinates - (filter #(= name (-> % last))) - last - first))) - -(defn- unify-one - [html coordinates [start end]] - (let [name (coordinates->tag-name html [start end]) - matching-idx (last-by-tag-name-idx html coordinates name start)] - (if matching-idx - (let [[matching-start] (nth coordinates matching-idx)] - (assoc coordinates matching-idx [matching-start end])) - coordinates))) - -(defn- unify-reducer-fn - "Returns a reducer function with the initial state of - a `html` string." - [html] - (fn [coordinates [start end]] - (if (and (= \< (nth html start)) - (= \/ (nth html (inc start) nil))) - (unify-one html coordinates [start end]) - (conj coordinates [start end])))) +(defn- coordinate-info + "Determines if a coordinate is an opening tag, closing tag, or text." + [html [start end]] + (let [value (subs html start (inc end))] + (cond + (str/starts-with? value "tag-name html [start end])} + + (str/starts-with? value "<") + {:coord-type :opening, :coord-name (coordinates->tag-name html [start end])} + + :else + {:coord-type :text, :coord-name :dompa/text}))) + +(def ^:private void-elements + #{"area" "base" "br" "col" "embed" "hr" "img" + "input" "link" "meta" "param" "source" "track" "wbr"}) + +(defn- handle-opening-tag [{:keys [stack unified coord coord-name start]}] + (if (void-elements coord-name) + {:stack stack + :unified (conj unified coord)} + {:stack (conj stack {:name coord-name :start start}) + :unified unified})) + +(defn- handle-closing-tag [{:keys [stack unified coord-name end]}] + (if-let [last-open (peek stack)] + (if (= coord-name (:name last-open)) + {:stack (pop stack) + :unified (conj unified [(:start last-open) end])} + {:stack stack :unified unified}) + {:stack stack :unified unified})) + +(defn- unify-reducer-fn [html] + (fn [{:keys [stack unified]} [start end :as coord]] + (let [{:keys [coord-type coord-name]} (coordinate-info html coord)] + (cond + (= coord-type :opening) + (handle-opening-tag {:stack stack + :unified unified + :coord coord + :coord-name coord-name + :start start}) + + (= coord-type :closing) + (handle-closing-tag {:stack stack + :unified unified + :coord-name coord-name + :end end}) + + :else + {:stack stack + :unified (conj unified coord)})))) (defn unify "Joins together given `coordinates` that represent - one HTML node in `html`, without which `html` such as: - - ```html -
hello
- ``` - - would result in 3 nodes (div, text, div), instead of 2 (div, text), - because non-unified coordinates are blind to the context - in which they live, having only had one pass over the - raw HTML string which composes the initial coordinates." + one HTML node in `html`, using a stack-based approach to correctly + handle nested and void tags." [{:keys [html coordinates]}] - {:html html - :coordinates (-> (unify-reducer-fn html) - (reduce [] coordinates))}) + (let [initial-state {:stack [], :unified []} + result (reduce (unify-reducer-fn html) initial-state coordinates)] + {:html html + :coordinates (sort-by first (:unified result))})) (defn- children "Returns all the coordinates that belong between the given diff --git a/src/dompa/nodes.cljc b/src/dompa/nodes.cljc index 8600026..9dbecd0 100644 --- a/src/dompa/nodes.cljc +++ b/src/dompa/nodes.cljc @@ -1,7 +1,7 @@ (ns dompa.nodes) (def ^:private default-void-nodes - #{:!doctype :area :base :br :col :embed :hr :img :input + #{:!doctype :!DOCTYPE :area :base :br :col :embed :hr :img :input :link :meta :source :track :wbr}) (defn- node-attrs-reducer [attrs k v] diff --git a/test/dompa/coordinates_test.cljc b/test/dompa/coordinates_test.cljc index 1a69114..9b05826 100644 --- a/test/dompa/coordinates_test.cljc +++ b/test/dompa/coordinates_test.cljc @@ -66,13 +66,13 @@ (testing "Unify coordinates with invalid HTML" (let [html "
hello"] (is (= {:html html - :coordinates [[0 4] [5 9]]} + :coordinates [[5 9]]} (-> (coordinates/compose html) coordinates/unify)))) (let [html "
hello"] (is (= {:html html - :coordinates [[0 4] [5 9]]} + :coordinates [[5 9]]} (-> (coordinates/compose html) coordinates/unify)))))