diff --git a/CHANGELOG.md b/CHANGELOG.md
index d231b46..407d907 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,15 @@
 # go-utils
 
+
+## [v1.42.0] - 2025-10-22
+### New Features
+- identifier tanya
+
+
 
-## [v1.41.0] - 2025-09-11
+## [v1.41.0] - 2025-09-17
 ### New Features
-- reverse slice
+- reverse slice ([#71](https://github.com/kumparan/go-utils/issues/71))
 
 
 
@@ -207,6 +213,9 @@
 - fix marshal issue on gorm.DeletedAt empty value ([#32](https://github.com/kumparan/go-utils/issues/32))
 
 
+
+## [v.1.20.0] - 2022-03-11
+
 
 ## [v1.20.0] - 2022-03-11
 ### New Features
@@ -306,11 +315,11 @@
 - add money formatter for multiple currencies ([#13](https://github.com/kumparan/go-utils/issues/13))
 
 
-
-## [v1.8.0] - 2020-12-10
-
 
 ## [v1.7.1] - 2020-12-10
+
+
+## [v1.8.0] - 2020-12-10
 ### New Features
 - add formatter for indonesian money and date
 
@@ -375,7 +384,8 @@
 - init go-utils
 
 
-[Unreleased]: https://github.com/kumparan/go-utils/compare/v1.41.0...HEAD
+[Unreleased]: https://github.com/kumparan/go-utils/compare/v1.42.0...HEAD
+[v1.42.0]: https://github.com/kumparan/go-utils/compare/v1.41.0...v1.42.0
 [v1.41.0]: https://github.com/kumparan/go-utils/compare/v1.40.2...v1.41.0
 [v1.40.2]: https://github.com/kumparan/go-utils/compare/v1.40.1...v1.40.2
 [v1.40.1]: https://github.com/kumparan/go-utils/compare/v1.40.0...v1.40.1
@@ -408,7 +418,8 @@
 [v1.23.0]: https://github.com/kumparan/go-utils/compare/v1.22.0...v1.23.0
 [v1.22.0]: https://github.com/kumparan/go-utils/compare/v1.21.0...v1.22.0
 [v1.21.0]: https://github.com/kumparan/go-utils/compare/v1.20.1...v1.21.0
-[v1.20.1]: https://github.com/kumparan/go-utils/compare/v1.20.0...v1.20.1
+[v1.20.1]: https://github.com/kumparan/go-utils/compare/v.1.20.0...v1.20.1
+[v.1.20.0]: https://github.com/kumparan/go-utils/compare/v1.20.0...v.1.20.0
 [v1.20.0]: https://github.com/kumparan/go-utils/compare/v1.19.3...v1.20.0
 [v1.19.3]: https://github.com/kumparan/go-utils/compare/v1.19.2...v1.19.3
 [v1.19.2]: https://github.com/kumparan/go-utils/compare/v1.19.1...v1.19.2
@@ -425,9 +436,9 @@
 [v1.12.0]: https://github.com/kumparan/go-utils/compare/v1.11.0...v1.12.0
 [v1.11.0]: https://github.com/kumparan/go-utils/compare/v1.10.0...v1.11.0
 [v1.10.0]: https://github.com/kumparan/go-utils/compare/v1.9.0...v1.10.0
-[v1.9.0]: https://github.com/kumparan/go-utils/compare/v1.8.0...v1.9.0
-[v1.8.0]: https://github.com/kumparan/go-utils/compare/v1.7.1...v1.8.0
-[v1.7.1]: https://github.com/kumparan/go-utils/compare/v1.7.0...v1.7.1
+[v1.9.0]: https://github.com/kumparan/go-utils/compare/v1.7.1...v1.9.0
+[v1.7.1]: https://github.com/kumparan/go-utils/compare/v1.8.0...v1.7.1
+[v1.8.0]: https://github.com/kumparan/go-utils/compare/v1.7.0...v1.8.0
 [v1.7.0]: https://github.com/kumparan/go-utils/compare/v1.6.0...v1.7.0
 [v1.6.0]: https://github.com/kumparan/go-utils/compare/v1.5.0...v1.6.0
 [v1.5.0]: https://github.com/kumparan/go-utils/compare/v1.4.0...v1.5.0
diff --git a/tanya/specs.go b/tanya/specs.go
new file mode 100644
index 0000000..8d294bb
--- /dev/null
+++ b/tanya/specs.go
@@ -0,0 +1,138 @@
+package tanya
+
+type (
+	// Intent is the intent of a query
+	Intent string
+	// MatchType is the type of match
+	MatchType string
+)
+
+// Intent and MatchType constants
+const (
+	IntentUpdate         Intent = "update"
+	IntentExplain        Intent = "explain"
+	IntentHowTo          Intent = "how_to"
+	IntentDefinition     Intent = "definition"
+	IntentComparison     Intent = "comparison"
+	IntentRecommendation Intent = "recommendation"
+	IntentTroubleshoot   Intent = "troubleshoot"
+	IntentLocation       Intent = "location"
+	IntentTime           Intent = "time"
+	IntentPrice          Intent = "price"
+	IntentContact        Intent = "contact"
+	IntentQuestion       Intent = "question" // general fallback
+	IntentOther          Intent = "other"
+
+	MatchTypeContains    MatchType = "contains"
+	MatchTypeStarts      MatchType = "starts"
+	MatchTypeEnds        MatchType = "ends"
+	MatchTypeTokenSuffix MatchType = "token_suffix" // nolint:gosec
+)
+
+type (
+	// Rule is a rule for matching a query to intent
+	Rule struct {
+		Terms       []string
+		Weight      int
+		MatchType   MatchType
+		MinTokenLen int // optional: for token_suffix; <=0 => default 4
+	}
+
+	// IntentSpec is a specification for intent
+	IntentSpec struct {
+		Intent   Intent
+		Priority int
+		Rules    []Rule
+	}
+)
+
+func terms(ss ...string) []string { return ss }
+
+var intentTable = []IntentSpec{
+	{IntentUpdate, 95, []Rule{
+		{terms("update", "perkembangan", "terbaru", "terkini", "progress", "lanjutan", "pembaruan"), 3, MatchTypeContains, 0},
+		{terms("hari ini", "sekarang", "terkini banget"), 1, MatchTypeContains, 0},
+	}},
+	{IntentExplain, 90, []Rule{
+		{terms("jelaskan", "jelasin", "penjelasan", "uraikan", "explain"), 3, MatchTypeContains, 0},
+		{terms("arti", "artinya", "maksud", "makna", "definisi"), 2, MatchTypeContains, 0},
+	}},
+	{IntentHowTo, 80, []Rule{
+		{terms("bagaimana cara ", "gimana cara "), 3, MatchTypeStarts, 0},
+		{terms("cara "), 3, MatchTypeStarts, 0},
+		{terms(" cara ", " langkah ", " step "), 1, MatchTypeContains, 0},
+		{terms("resep "), 3, MatchTypeStarts, 0},
+		{terms(" panduan ", "panduan "), 2, MatchTypeContains, 0},
+		{terms(" tutorial ", "tutorial "), 2, MatchTypeContains, 0},
+	}},
+	{IntentDefinition, 75, []Rule{
+		{terms("apa itu "), 3, MatchTypeStarts, 0},
+		{terms("apa arti", "apa maksud"), 2, MatchTypeContains, 0},
+	}},
+	{IntentComparison, 70, []Rule{
+		{terms(" vs ", " versus "), 2, MatchTypeContains, 0},
+		{terms("perbedaan ", "beda "), 2, MatchTypeContains, 0},
+		{terms("bagusan mana", "lebih bagus mana", "pilih mana"), 2, MatchTypeContains, 0},
+	}},
+	{IntentRecommendation, 65, []Rule{
+		{terms("rekomendasi", "rekom", "saran"), 2, MatchTypeContains, 0},
+		{terms("bagusan mana", "pilih mana", "cocok yang mana"), 2, MatchTypeContains, 0},
+		{terms("menu "), 2, MatchTypeStarts, 0},
+		{terms(" ide ", "ide "), 1, MatchTypeContains, 0},
+	}},
+	{IntentTroubleshoot, 60, []Rule{
+		{terms("kenapa", "mengapa"), 2, MatchTypeContains, 0},
+		{terms("kok "), 2, MatchTypeStarts, 0},
+		{terms("error", "gagal", "bug", "crash", "macet", "hang"), 2, MatchTypeContains, 0},
+		{terms("solusi ", "fix ", "gimana sih", "kenapa sih"), 1, MatchTypeContains, 0},
+	}},
+	{IntentLocation, 55, []Rule{
+		{terms("dimana", "di mana", "kemana", "ke mana", "lokasi", "alamat"), 2, MatchTypeContains, 0},
+		{terms(" kemana", " dimana", "di mana", " alamat", " lokasi"), 2, MatchTypeEnds, 0},
+	}},
+	{IntentTime, 55, []Rule{
+		{terms("kapan", "jadwal", "jam berapa", "pukul berapa"), 2, MatchTypeContains, 0},
+		{terms("hari ini", "minggu ini", "sekarang", "besok", "nanti sore", "malam ini"), 1, MatchTypeContains, 0},
+	}},
+	{IntentPrice, 50, []Rule{
+		{terms("harga", "biaya", "tarif", "fee", "ongkir"), 2, MatchTypeContains, 0},
+	}},
+	{IntentContact, 50, []Rule{
+		{terms("kontak", "contact", "telepon", "telp", "nomor", "email", "whatsapp", "wa"), 2, MatchTypeContains, 0},
+	}},
+	// fallback tanya umum
+	{IntentQuestion, 10, []Rule{
+		{terms("apa", "apakah", "bagaimana", "gimana", "kapan", "siapa", "dimana", "di mana", "kemana", "ke mana", "berapa"), 2, MatchTypeContains, 0},
+		{terms(" vs ", " versus "), 1, MatchTypeContains, 0},
+		{terms(" yang mana "), 2, MatchTypeContains, 0},
+		{terms("yang mana "), 2, MatchTypeStarts, 0},
+		{terms(" mana"), 2, MatchTypeEnds, 0},
+		{terms("kah"), 1, MatchTypeTokenSuffix, 5},
+		{terms("ya ga sih", "ya gak sih", "ya nggak sih", "ya kan", "apa sih", "gimana sih", "kenapa sih"), 2, MatchTypeContains, 0},
+		{terms(" kok "), 2, MatchTypeContains, 0},
+		{terms("?"), 3, MatchTypeContains, 0},
+	}},
+}
+
+var abbrevMap = map[string]string{
+	"gmn":   "gimana",
+	"gmna":  "gimana",
+	"bgmn":  "bagaimana",
+	"knp":   "kenapa",
+	"knpa":  "kenapa",
+	"dmn":   "di mana",
+	"dmna":  "di mana",
+	"dimn":  "di mana",
+	"kmn":   "ke mana",
+	"kmna":  "ke mana",
+	"brp":   "berapa",
+	"brpa":  "berapa",
+	"kpn":   "kapan",
+	"kpan":  "kapan",
+	"sapa":  "siapa",
+	"sp":    "siapa",
+	"syp":   "siapa",
+	"sypa":  "siapa",
+	"apkh":  "apakah",
+	"apakh": "apakah",
+}
diff --git a/tanya/tanya.go b/tanya/tanya.go
new file mode 100644
index 0000000..e635af0
--- /dev/null
+++ b/tanya/tanya.go
@@ -0,0 +1,180 @@
+package tanya
+
+import (
+	"sort"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// IsQuestion returns true if a query is a question
+func IsQuestion(q string) bool {
+	intent := ClassifyIntent(q)
+	switch intent { // nolint:exhaustive
+	case IntentPrice, IntentContact, IntentOther:
+		return false
+	default:
+		return true
+	}
+}
+
+// ClassifyIntent returns the most likely intent for the given query
+func ClassifyIntent(q string) Intent {
+	q = normalize(q)
+	if q == "" {
+		return IntentOther
+	}
+	type scored struct {
+		intent      Intent
+		score, prio int
+	}
+	var candidates []scored
+
+	for _, spec := range intentTable {
+		score := 0
+		for _, r := range spec.Rules {
+			if matchByType(q, r) {
+				score += r.Weight
+			}
+		}
+		if score != 0 {
+			candidates = append(candidates, scored{spec.Intent, score, spec.Priority})
+		}
+	}
+	if len(candidates) == 0 {
+		return IntentOther
+	}
+
+	sort.Slice(candidates, func(i, j int) bool {
+		if candidates[i].score == candidates[j].score {
+			return candidates[i].prio > candidates[j].prio
+		}
+		return candidates[i].score > candidates[j].score
+	})
+	return candidates[0].intent
+}
+
+// matchByType reports whether any term of r matches q under r.MatchType; unknown match types never match.
+func matchByType(q string, r Rule) bool {
+	switch r.MatchType {
+	case MatchTypeContains:
+		for _, t := range r.Terms {
+			if strings.Contains(q, t) {
+				return true
+			}
+		}
+	case MatchTypeStarts:
+		for _, t := range r.Terms {
+			if strings.HasPrefix(q, t) {
+				return true
+			}
+		}
+	case MatchTypeEnds:
+		for _, t := range r.Terms {
+			if strings.HasSuffix(q, t) {
+				return true
+			}
+		}
+	case MatchTypeTokenSuffix:
+		minLen := r.MinTokenLen
+		if minLen <= 0 {
+			minLen = 4
+		}
+		for _, tok := range tokenize(q) {
+			if len(tok) < minLen {
+				continue
+			}
+			for _, suf := range r.Terms {
+				if strings.HasSuffix(tok, suf) {
+					return true
+				}
+			}
+		}
+	}
+
+	return false
+}
+
+// normalize lowercases s, collapses whitespace runs and expands known abbreviations.
+func normalize(s string) string {
+	s = strings.ToLower(strings.TrimSpace(collapseSpaces(s)))
+	s = " " + s + " "
+	s = expandAbbreviations(s)
+	return strings.TrimSpace(collapseSpaces(s))
+}
+
+// collapseSpaces replaces each run of Unicode whitespace with one space and trims both ends.
+func collapseSpaces(s string) string {
+	var b strings.Builder
+	sp := false
+	for _, r := range s {
+		if unicode.IsSpace(r) {
+			if !sp {
+				b.WriteByte(' ')
+				sp = true
+			}
+		} else {
+			b.WriteRune(r)
+			sp = false
+		}
+	}
+	return strings.TrimSpace(b.String())
+}
+
+// expandAbbreviations replaces known abbreviations anywhere in s (start/mid/end).
+func expandAbbreviations(s string) string {
+	words := strings.Fields(s)
+	for i, w := range words {
+		if repl, ok := abbrevMap[w]; ok {
+			words[i] = repl
+			continue
+		}
+		// handle punctuation like "knp?" or "dmn," etc.
+		base := strings.TrimRight(w, "?.!,")
+		suffix := w[len(base):]
+		if repl, ok := abbrevMap[base]; ok {
+			words[i] = repl + suffix
+		}
+	}
+	return strings.Join(words, " ")
+}
+
+// tokenize splits on whitespace and trims leading/trailing non-letters/digits per token.
+// keeps tokens simple & fast (no regex).
+func tokenize(s string) []string {
+	raw := strings.Fields(s)
+	out := make([]string, 0, len(raw))
+	for _, t := range raw {
+		t = trimNonAlphaNum(t)
+		if t != "" {
+			out = append(out, t)
+		}
+	}
+	return out
+}
+
+// trimNonAlphaNum strips leading and trailing runes that are neither letters
+// nor digits, decoding full UTF-8 runes at both ends.
+func trimNonAlphaNum(s string) string {
+	start, end := 0, len(s)
+	for start < end {
+		// Decode the whole rune: classifying a single byte would mistake UTF-8
+		// lead bytes (e.g. 0xC2, 0xF0) for Latin-1 letters and stop trimming.
+		r, w := utf8.DecodeRuneInString(s[start:])
+		if isAlphaNum(r) {
+			break
+		}
+		start += w
+	}
+	for end > start {
+		r, w := utf8.DecodeLastRuneInString(s[:end])
+		if isAlphaNum(r) {
+			break
+		}
+		end -= w
+	}
+	return s[start:end]
+}
+
+// isAlphaNum reports whether r is a Unicode letter or digit.
+func isAlphaNum(r rune) bool { return unicode.IsLetter(r) || unicode.IsDigit(r) }
diff --git a/tanya/tanya_test.go b/tanya/tanya_test.go
new file mode 100644
index 0000000..59a6eb3
--- /dev/null
+++ b/tanya/tanya_test.go
@@ -0,0 +1,190 @@
+package tanya
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestIsQuestion(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		q    string
+		want bool
+	}{
+		// --- explicit / canonical questions
+		{"apa itu knowledge graph", true},
+		{"bagaimana cara reset password gmail", true},
+		{"perbedaan redux vs zustand", true},
+		{"kapan sidang mk hari ini", true},
+		{"si andi pergi kemana ya", true},
+		{"kok servernya error pas deploy", true},
+		{"ya nggak sih performanya drop", true},
+		{"jelasin cara mukbang", true},
+		{"update kematian mahasiswa unud", true},
+
+		// --- abbreviations / slang normalization (start/mid/end + punct)
+		{"gmn cara scrape instagram", true}, // gmn -> gimana
+		{"gmna cara beli tiket", true},      // gmna -> gimana
+		{"bgmn cara install docker", true},  // bgmn -> bagaimana
+		{"knp server down semalem", true},   // knp -> kenapa
+		{"knpa servernya lambat", true},     // knpa -> kenapa
+		{"dmn lokasi konser", true},         // dmn -> dimana
+		{"dmna lokasi vaksin", true},        // dmna -> dimana
+		{"ini ada di dimn", true},           // dimn -> dimana
+		{"kmn mau makan siang?", true},      // kmn -> kemana
+		{"kita kmna abis ini", true},        // kmna -> kemana
+		{"brp harga langganan", false},      // brp -> berapa -> price => non-question
+		{"kpn rilis update?", true},         // kpn -> kapan
+		{"kpan meetingnya", true},           // kpan -> kapan
+		{"sapa yang ikut", true},            // sapa -> siapa
+		{"sp aja yang hadir", true},         // sp -> siapa
+		{"knp?", true},                      // knp at start + punctuation
+		{"server down dmn,", true},          // trailing punctuation handled
+
+		// --- particles at the end (colloquial endings)
+		{"mau makan kemana siang ini", true},
+		{"dia tadi ke kantor dimana", true},
+		{"ini kenapa ya", true},
+		{"ini apa sih", true},
+		{"performanya turun ya kan", true},
+
+		// --- -kah suffix (via token_suffix) including punctuation
+		{"bisakah presiden diganti", true},
+		{"mungkinkah ini berhasil", true},
+		{"adakah solusi cepatnya", true},
+		{"mungkinkah ini berhasil!!!", true},
+		{"akah", false}, // too short to be meaningful (guard by MinTokenLen)
+
+		// --- how-to variants
+		{"cara deploy ke production docker", true},
+		{"bagaimana cara memperbaiki error 500", true},
+		{"cara cepat push ke github ", true}, // extra spaces
+		{"cara setting oauth di https://example.com/docs", true},
+		{"resep bubur bayi 6 bulan", true},
+		{"resep mpasi tanpa gula garam", true},
+		{"menu mpasi 6 bulan", true},
+		{"ide mpasi murah meriah", true},
+		{"tutorial docker", true},
+		{"panduan upgrade postgres", true},
+
+		// --- comparison signals
+		{"bagusan mana mirrorless atau dslr", true},
+		{"A vs B untuk data pipeline", true},
+		{"versus airflow vs dagster", true},
+		{"pilih mana A atau B", true},
+		{"lebih bagus mana iphone atau pixel", true},
+
+		// --- definition / explain variants
+		{"apa arti resilien", true},
+		{"apa maksud zero copy", true},
+		{"explain RAG pls", true},
+		{"penjelasan implementasi RAG", true},
+
+		// --- update / time / location intent
+		{"terkini erupsi bromo", true},
+		{"perkembangan kasus x sekarang", true},
+		{"lokasi kantor jakarta selatan", true}, // location -> question-like
+		{"jadwal konser jakarta", true},         // time -> question-like
+
+		// --- “yang mana” (keep as question), but “mana store” should not
+		{"yang mana yang benar", true},
+		{"ini pilih yang mana", true},
+		{"mana store", false}, // 'mana' as noun chunk; intended info/browse
+
+		// --- punctuation / emoji / casing
+		{"KENAPA SERVER LEMOT", true},
+		{"Kenapa server lemot?", true},
+		{"kenapa server lemot 🤔", true},
+		{" Bagaimana Cara Reset Password ", true},
+
+		// --- tricky “vs” that is not comparison (product name)
+		{"vs code extensions", false}, // treat 'vs' here as product word, not comparison
+
+		// --- obvious non-questions
+		{"toyota", false},
+		{"jakarta", false},
+		{"harga paket premium", false},
+		{"kontak cs kumparan", false},
+		{"download aplikasi android", false},
+		{"grab promo kupon", false},
+		{"", false},
+		{" \t ", false},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		name := tc.q
+		if strings.TrimSpace(name) == "" {
+			name = ""
+		}
+		t.Run(name, func(t *testing.T) {
+			t.Parallel()
+			got := IsQuestion(tc.q)
+			if got != tc.want {
+				t.Fatalf("IsQuestion(%q) = %v, want %v", tc.q, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestClassifyIntent(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		q    string
+		want Intent
+	}{
+		{"update kematian mahasiswa unud", IntentUpdate},
+		{"jelasin cara mukbang", IntentExplain},
+		{"arti overfitting", IntentExplain},
+		{"bagaimana cara reset password gmail", IntentHowTo},
+		{"apa itu knowledge graph", IntentDefinition},
+		{"perbedaan redux vs zustand", IntentComparison},
+		{"rekomendasi laptop 10 jutaan untuk desain", IntentRecommendation},
+		{"kok servernya error pas deploy", IntentTroubleshoot},
+		{"alamat kantor kumparan dimana ya", IntentLocation},
+		{"mau makan kemana siang ini", IntentLocation},
+		{"kapan jadwal konser hari ini", IntentTime},
+		{"berapa harga paket premium", IntentPrice},
+		{"kontak cs atau nomor wa resmi", IntentContact},
+		{"toyota", IntentOther},
+		{"download aplikasi android", IntentOther},
+
+		// mixed signals
+		{"update berita gempa vs banjir hari ini", IntentUpdate},
+		{"apa sih lebih bagus mana A vs B", IntentQuestion},
+		{"gmn cara beli tiket konser", IntentHowTo},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.q, func(t *testing.T) {
+			t.Parallel()
+			got := ClassifyIntent(tc.q)
+			if got != tc.want {
+				t.Fatalf("ClassifyIntent(%q) = %s, want %s", tc.q, got, tc.want)
+			}
+		})
+	}
+}
+
+func BenchmarkIsQuestion(b *testing.B) {
+	queries := []string{
+		"apa itu knowledge graph",
+		"bagaimana cara reset password gmail",
+		"update kematian mahasiswa unud",
+		"jelasin cara mukbang",
+		"perbedaan redux vs zustand",
+		"toyota",
+		"harga paket premium",
+		"kontak cs kumparan",
+		"download aplikasi android",
+	}
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for _, q := range queries {
+			_ = IsQuestion(q)
+		}
+	}
+}