Neamar · YunusEmreAlps · Jan 18, 2023 · Feb 20, 2026 · Feb 20, 2026
diff --git a/README.md b/README.md
@@ -1,46 +1,106 @@
-Content aware document Highlighter
-=======================
-![Build Status](https://travis-ci.org/Neamar/document-highlighter.png)
-![Coverage Status](https://coveralls.io/repos/Neamar/document-highlighter/badge.png?branch=master)
+# Hylite
+
+*Hylite*, a.k.a. `document highlighter`, is a JavaScript library that adds highlights to a *raw / HTML document* for a specified query. It handles Unicode, stop-words and punctuation, and generates HTML-compliant highlights, even for complex markup.
+
+> The name is a combination of "highlight" and "lite", as in "lightweight". It also sounds like "high light", which is what the library does: it highlights text in a document.
+
+## Table of Contents
+
+- [Hylite](#hylite)
+  - [Table of Contents](#table-of-contents)
+  - [Installation](#installation)
+  - [Samples](#samples)
+    - [Plain Text](#plain-text)
+    - [Stopwords](#stopwords)
+    - [HTML](#html)
+  - [Usage](#usage)
+    - [Highlight plain text documents](#highlight-plain-text-documents)
+    - [Highlight HTML documents](#highlight-html-documents)
+    - [Customize highlight markup](#customize-highlight-markup)
+  - [Status](#status)
+  - [Supported languages](#supported-languages)
+  - [Contributing](#contributing)
+
+## Installation
+
+```bash
+# Move to your workspace
+cd <your-workspace>
+
+# Clone this project into your workspace
+git clone <repository-url>
+
+# Move to the project root directory
+cd hylite
 
-## What is `document highlighter`?
-Add highlight to a raw / HTML document for the specified query. Handle unicode, stop-words and punctuation.
-Generate HTML-compliant highlights, even for complex markup.
+# Open the project in your favorite IDE
+code . # For Visual Studio Code
+
+# Install dependencies
+npm install
+
+# Run tests
+npx mocha test/index.js test/highlight.js test/real.js test/turkish.js
+```
 
 ## Samples
-### Plain text
-#### Simple case
-The following text :
 
-> The index analysis module acts as a configurable registry of Analyzers that can be used in order to both break indexed (analyzed) fields when a document is indexed and process query strings. It maps to the Lucene Analyzer.
+### Plain Text
+
+```plain
+The index analysis module acts as a configurable registry of Analyzers that can be used in order to both break indexed (analyzed) fields when a document is indexed and process query strings. It maps to the Lucene Analyzer.
+```
 
 When highlighted for the query `The index analysis string` will become:
 
-> **The index analysis** module acts as a configurable registry of Analyzers that can be used in order to both break indexed (analyzed) fields when a document is indexed and process query **strings**. It maps to the Lucene Analyzer.
+```plain
+**The index analysis** module acts as a configurable registry of Analyzers that can be used in order to both break indexed (analyzed) fields when a document is indexed and process query **strings**. It maps to the Lucene Analyzer.
+```
+
+> Note that the generated markup is minimal: one item per match, not one item per word.
 
-Note generated markup is minimal (one item per match, and not one item per word).
+### Stopwords
 
-#### Stopwords
-Document highlighter handles stopwords and punctuation according to the language specified. For instance, the following text:
+Document highlighter handles *stopwords* and *punctuation* according to the language specified. For instance, the following text:
 
-> Install this library, and start using it.
+```plain
+Install this library, and start using it.
+```
 
 When highlighted for the query `install library` will become:
 
-> **Install this library**, and start using it.
+```plain
+**Install this library**, and start using it.
+```
 
 ### HTML
-This also works for HTML documents, e.g. :
 
-> This document contains _italics_ and stuff.
+This also works for *HTML* documents, e.g. :
+
+```html
+This document contains _italics_ and stuff.
+```
 
 When highlighted for the query `it contains some italic empty` will become:
-> This document **contains _italics_** and stuff.
 
-Document highlighter maintains original markup and add wrapping tags as needed.
+```html
+This document **contains _italics_** and stuff.
+```
+
+> Document highlighter maintains the original markup and adds wrapping tags as needed.
 
 ## Usage
+
+> First, install the library:
+
+```bash
+npm install document-highlighter
+```
+
+Then, you can use it as follows:
+
 ### Highlight plain text documents
+
 ```javascript
 var highlighter = require('document-highlighter');
 
@@ -60,6 +120,7 @@ console.log(hl.indices);
 ```
 
 ### Highlight HTML documents
+
 ```javascript
 var highlighter = require('document-highlighter');
 
@@ -76,6 +137,7 @@ console.log(hl.text);
 ```
 
 ### Customize highlight markup
+
 ```javascript
 var highlighter = require('document-highlighter');
 
@@ -109,3 +171,20 @@ var hl = highlighter.text(
 console.log(hl.text);
 // "In JavaScript, you can define a <span class="hlt">callback handler in</span> regex string replace <span class="hlt">operations</span>"
 ```
+
+## Status
+
+![Build Status](https://travis-ci.org/Neamar/document-highlighter.png)
+![Coverage Status](https://coveralls.io/repos/Neamar/document-highlighter/badge.png?branch=master)
+
+## Supported languages
+
+Document highlighter supports the following languages:
+
+- English
+- French
+- Turkish
+
+## Contributing
+
+Contributions are welcome! Please open an issue or submit a pull request.
diff --git a/lib/helpers/normalize.js b/lib/helpers/normalize.js
@@ -39,16 +39,66 @@ var normalizePunctuation = function normalizePunctuation(regexpToken) {
  * Replace accented characters in the query by their latin equivalent
  */
 var normalizeUnicode = function normalizeUnicode(regexpToken, languageData) {
-  for(var letter in languageData.unicode || {}) {
-    var letterMutations = languageData.unicode[letter].slice(0);
-    letterMutations.unshift(letter);
-
-    var charClass = '(?:' + letterMutations.join('|') + ')';
+  // regexpToken: regexp source string for one query token.
+  // languageData: language entry; its optional `unicode` map lists, for a character, its equivalents.
+  // Returns regexpToken where every mapped character is replaced by a non-capturing
+  // alternation of its whole equivalence group, e.g. "e" becomes (?:e|é|è), and so does "é".
+  var unicode = languageData.unicode || {};
+  var hasOwn = Object.prototype.hasOwnProperty;
+  var classByChar = {};
+  var escapedChars = [];
+
+  // Append to `group` every variant it does not contain yet
+  var mergeInto = function(group, variants) {
+    (variants || []).forEach(function(variant) {
+      if(group.indexOf(variant) === -1) {
+        group.push(variant);
+      }
+    });
+  };
 
-    // Search for pattern charClass and replace it... with charClass, i.e. "e" becomes (é|è|e), and so do é.
-    regexpToken = regexpToken.replace(new RegExp(charClass, 'g'), charClass);
+  // Build character groups - find all characters that should be equivalent
+  for(var letter in unicode) {
+    // Skip inherited keys and letters already assigned to a group
+    if(!hasOwn.call(unicode, letter) || hasOwn.call(classByChar, letter)) continue;
+
+    var group = [letter];
+    mergeInto(group, unicode[letter]);
+
+    // Also merge the variants' own mappings (one level deep).
+    // Iterate over a snapshot, since the group grows while merging.
+    group.slice(0).forEach(function(member) {
+      if(hasOwn.call(unicode, member)) {
+        mergeInto(group, unicode[member]);
+      }
+    });
+
+    var charClass = '(?:' + group.join('|') + ')';
+
+    // The first group a character lands in wins
+    group.forEach(function(member) {
+      if(!hasOwn.call(classByChar, member)) {
+        classByChar[member] = charClass;
+        // Escaped so the character is matched literally in the search pattern below
+        escapedChars.push(member.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
+      }
+    });
   }
 
+  // Substitute every mapped character in ONE pass: text inserted by a substitution is
+  // never rescanned, so no placeholder is needed (index-based placeholders collided
+  // with digits of the token, e.g. "i2i").
+  // Guard: an empty alternation would match the empty string at every position.
+  if(escapedChars.length > 0) {
+    // Function replacer: the class is inserted verbatim, without "$" pattern expansion
+    regexpToken = regexpToken.replace(new RegExp(escapedChars.join('|'), 'g'), function(match) {
+      return classByChar[match];
+    });
+  }
+
   return regexpToken;
 };
 

diff --git a/lib/languages.json b/lib/languages.json
@@ -19,21 +19,33 @@
   },
   "tr": {
     "unicode": {
-      "o": ["ö"],
-      "u": ["ü"],
-      "i": ["ı"],
-      "g": ["ğ"],
-      "c": ["ç"],
-      "s": ["ş"],
-      "O": ["Ö"],
-      "U": ["Ü"],
-      "I": ["İ", "I"],
-      "G": ["Ğ"],
-      "C": ["Ç"],
-      "S": ["Ş"]
+      "o": ["ö", "O", "Ö"],
+      "ö": ["o", "O", "Ö"],
+      "u": ["ü", "U", "Ü"],
+      "ü": ["u", "U", "Ü"],
+      "i": ["ı", "İ", "I"],
+      "ı": ["i", "İ", "I"],
+      "g": ["ğ", "G", "Ğ"],
+      "ğ": ["g", "G", "Ğ"],
+      "c": ["ç", "C", "Ç"],
+      "ç": ["c", "C", "Ç"],
+      "s": ["ş", "S", "Ş"],
+      "ş": ["s", "S", "Ş"],
+      "O": ["ö", "o", "Ö"],
+      "Ö": ["o", "ö", "O"],
+      "U": ["ü", "u", "Ü"],
+      "Ü": ["u", "ü", "U"],
+      "İ": ["ı", "i", "I"],
+      "I": ["ı", "i", "İ"],
+      "G": ["ğ", "g", "Ğ"],
+      "Ğ": ["g", "ğ", "G"],
+      "C": ["ç", "c", "Ç"],
+      "Ç": ["c", "ç", "C"],
+      "S": ["ş", "s", "Ş"],
+      "Ş": ["s", "ş", "S"]
     },
-    "alpha_range": "a-zöüığçşÖÜİĞÇŞ",
-    "suffix":["s"],
+    "alpha_range": "a-zA-ZöüiığçşÖÜİIĞÇŞ",
+    "suffix": ["s", "lar", "ler", "lık", "lik", "luk", "lük", "da", "de", "ta", "te"],
     "stopwords": ["acaba", "ama", "aslında", "az", "bazı", "belki", "biri", "birkaç", "birşey", "biz", "bu", "çok", "çünkü", "da", "daha", "de", "defa", "diye", "eğer", "en", "gibi", "hem", "hep", "hepsi", "her", "hiç", "için", "ile", "ise", "kez", "ki", "kim", "mı", "mu", "mü", "nasıl", "ne", "neden", "nerde", "nerede", "nereye", "niçin", "niye", "o", "sanki", "şey", "şu", "tüm", "ve", "veya", "ya", "yani"]
   }
 }