firesidetech/best-practices.html at main · JasperNoBoxDev/firesidetech · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
<!DOCTYPE html><html lang="en" class="dark">
<!-- Mirrored from guide.gpt-trainer.com/best-practices by HTTrack Website Copier/3.x [XR&CO'2014], Tue, 07 Jan 2025 14:53:24 GMT -->
<!-- Added by HTTrack --><meta http-equiv="content-type" content="text/html;charset=utf-8" /><!-- /Added by HTTrack -->
<head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><link rel="apple-touch-icon" type="image/png" sizes="180x180" href="../mintlify.s3-us-west-1.amazonaws.com/paladinmaxinc/_generated/favicon/apple-touch-icon30f4.png?v=3"/><link rel="icon" type="image/png" sizes="32x32" href="../mintlify.s3-us-west-1.amazonaws.com/paladinmaxinc/_generated/favicon/favicon-32x3230f4.png?v=3"/><link rel="icon" type="image/png" sizes="16x16" href="../mintlify.s3-us-west-1.amazonaws.com/paladinmaxinc/_generated/favicon/favicon-16x1630f4.png?v=3"/><link rel="shortcut icon" type="image/x-icon" href="https://mintlify.s3-us-west-1.amazonaws.com/paladinmaxinc/_generated/favicon/favicon.ico?v=3"/><meta name="msapplication-config" content="https://mintlify.s3-us-west-1.amazonaws.com/paladinmaxinc/_generated/favicon/browserconfig.xml?v=3"/><meta name="apple-mobile-web-app-title" content="GPT-trainer API"/><meta name="application-name" content="GPT-trainer API"/><meta name="msapplication-TileColor" content="#2E3F51"/><meta name="theme-color" content="#ffffff"/><link rel="sitemap" type="application/xml" href="sitemap.xml"/><meta name="charset" content="utf-8"/><meta name="og:type" content="website"/><meta name="og:site_name" content="GPT-trainer API"/><meta name="twitter:card" content="summary_large_image"/><meta name="og:title" content="Best practices for preparing training data - GPT-trainer API"/><meta name="twitter:title" content="Best practices for preparing training data - GPT-trainer API"/><meta name="og:image" content="https://mintlify.com/docs/api/og?division=Documentation&amp;title=Best+practices+for+preparing+training+data&amp;logoLight=https%3A%2F%2Fmintlify.s3.us-west-1.amazonaws.com%2Fpaladinmaxinc%2Flogo%2Flight.svg&amp;logoDark=https%3A%2F%2Fmintlify.s3.us-west-1.amazonaws.com%2Fpaladinmaxinc%2Flogo%2Fdark.svg&amp;primaryColor=%232E3F51&amp;lightColor=%23516F90&amp;darkColor=%230D001D"/><meta name="twitter:image" 
content="https://mintlify.com/docs/api/og?division=Documentation&amp;title=Best+practices+for+preparing+training+data&amp;logoLight=https%3A%2F%2Fmintlify.s3.us-west-1.amazonaws.com%2Fpaladinmaxinc%2Flogo%2Flight.svg&amp;logoDark=https%3A%2F%2Fmintlify.s3.us-west-1.amazonaws.com%2Fpaladinmaxinc%2Flogo%2Fdark.svg&amp;primaryColor=%232E3F51&amp;lightColor=%23516F90&amp;darkColor=%230D001D"/><title>Best practices for preparing training data - GPT-trainer API</title><meta name="og:url" content="/best-practices"/><link rel="canonical" href="best-practices.html"/><meta name="next-head-count" content="23"/><link rel="stylesheet" href="../cdn.jsdelivr.net/npm/katex%400.16.0/dist/katex.min.css" integrity="sha384-Xi8rHCmBmhbuyyhbI88391ZKP2dmfnOl4rT9ZfRI7mLTdk1wblIUnrIq35nqwEvC" crossorigin="anonymous"/><link rel="preload" href="_next/static/media/a34f9d1faa5f3315-s.p.woff2" as="font" type="font/woff2" crossorigin="anonymous" data-next-font="size-adjust"/><link rel="preload" href="_next/static/media/bb3ef058b751a6ad-s.p.woff2" as="font" type="font/woff2" crossorigin="anonymous" data-next-font="size-adjust"/><script id="mode-toggle" data-nscript="beforeInteractive">
      try {
        // Decide whether the root element carries the `dark` class.
        // A stored 'true' / 'false' value decides the mode outright; with no
        // stored key at all, the OS-level colour-scheme preference is used.
        (function () {
          var rootClasses = document.documentElement.classList;
          var saved = localStorage.isDarkMode;
          var useDark;

          if (saved === 'true' || saved === 'false') {
            useDark = saved === 'true';
          } else {
            // The system preference is consulted only when the key is absent;
            // a key holding any other value resolves to light mode.
            useDark =
              !('isDarkMode' in localStorage) &&
              window.matchMedia('(prefers-color-scheme: dark)').matches;
          }

          if (useDark) {
            rootClasses.add('dark');
          } else {
            rootClasses.remove('dark');
          }
        })();
        // Any exception (for example if localStorage is inaccessible) is
        // ignored, leaving the class list exactly as the markup shipped it.
      } catch (_) {}
    </script><link rel="preload" href="_next/static/css/16035c2adeba2fd7.css" as="style"/><link rel="stylesheet" href="_next/static/css/16035c2adeba2fd7.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="_next/static/chunks/polyfills-42372ed130431b0a.js"></script><script src="_next/static/chunks/webpack-99a660de06a74703.js" defer=""></script><script src="_next/static/chunks/framework-44a6e5dc2ffde502.js" defer=""></script><script src="_next/static/chunks/main-6f86f9a153903fae.js" defer=""></script><script src="_next/static/chunks/pages/_app-1767ed3009913161.js" defer=""></script><script src="_next/static/chunks/2edb282b-7fa355f49eaeb230.js" defer=""></script><script src="_next/static/chunks/e893f787-54a006ae51267903.js" defer=""></script><script src="_next/static/chunks/086d643d-3aa1cf46914548d9.js" defer=""></script><script src="_next/static/chunks/9097-75a5bfb192203d09.js" defer=""></script><script src="_next/static/chunks/7669-2cdcbf7436d2d1bd.js" defer=""></script><script src="_next/static/chunks/5339-37d37f3a0f878abf.js" defer=""></script><script src="_next/static/chunks/4922-ba19e45713cda605.js" defer=""></script><script src="_next/static/chunks/pages/_sites/%5bsubdomain%5d/%5b%5b...slug%5d%5d-7f2259a5793aeffc.js" defer=""></script><script src="_next/static/pChs_9tFT1YAEINLWWPhQ/_buildManifest.js" defer=""></script><script src="_next/static/pChs_9tFT1YAEINLWWPhQ/_ssgManifest.js" defer=""></script><style id="__jsx-4145347147">:root{--font-inter:'__Inter_e5ab12', '__Inter_Fallback_e5ab12';--font-jetbrains-mono:'__JetBrains_Mono_3c557b', '__JetBrains_Mono_Fallback_3c557b'}</style></head><div id="__next"><main class="jsx-4145347147"><style>:root {
    --primary: 46 63 81;
    --primary-light: 81 111 144;
    --primary-dark: 13 0 29;
    --background-light: 255 255 255;
    --background-dark: 17 24 39;
    --gray-50: 244 244 245;
    --gray-100: 239 239 240;
    --gray-200: 223 224 224;
    --gray-300: 207 207 208;
    --gray-400: 159 160 160;
    --gray-500: 113 113 114;
    --gray-600: 81 81 82;
    --gray-700: 63 64 65;
    --gray-800: 38 39 39;
    --gray-900: 24 24 25;
    --gray-950: 11 12 12;
  }</style><div class="relative antialiased text-gray-500 dark:text-gray-400"><span class="fixed inset-0 bg-background-light dark:bg-background-dark -z-10"></span><div id="navbar" class="z-30 fixed lg:sticky top-0 w-full"><div id="navbar-transition" class="absolute w-full h-full backdrop-blur flex-none transition-colors duration-500 border-b border-gray-500/5 dark:border-gray-300/[0.06] supports-backdrop-blur:bg-background-light/60 dark:bg-transparent"></div><div class="max-w-8xl mx-auto relative"><div class=""><div class="relative"><div class="flex items-center lg:px-12 h-16 min-w-0 px-4"><div class="h-full relative flex-1 flex items-center gap-x-4 min-w-0 border-b border-gray-500/5 dark:border-gray-300/[0.06] lg:border-none"><div class="flex-1 flex items-center gap-x-4"><a href="index.html"><span class="sr-only">GPT-trainer API<!-- --> home page</span><img class="w-auto h-7 relative object-contain block dark:hidden" src="https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/logo/light.svg" alt="light logo"/><img class="w-auto h-7 relative object-contain hidden dark:block" src="https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/logo/dark.svg" alt="dark logo"/></a><div class="flex items-center gap-x-2"></div></div><div class="hidden lg:block mx-px relative flex-1 bg-white dark:bg-gray-900 pointer-events-auto rounded-xl min-w-0"><button type="button" class="w-full flex items-center text-sm leading-6 rounded-xl py-1.5 pl-3.5 pr-3 shadow-sm text-gray-400 dark:text-white/50 bg-background-light dark:bg-background-dark dark:brightness-[1.1] dark:ring-1 dark:hover:brightness-[1.25] ring-1 ring-gray-400/20 hover:ring-gray-600/25 dark:ring-gray-600/30 dark:hover:ring-gray-500/30 focus:outline-primary justify-between truncate gap-2 min-w-[43px]" id="search-bar-entry"><div class="flex items-center gap-3 min-w-[42px]"><svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" 
stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-search min-w-4 flex-none text-gray-700 hover:text-gray-800 dark:text-gray-300 hover:dark:text-gray-200"><circle cx="11" cy="11" r="8"></circle><path d="m21 21-4.3-4.3"></path></svg><div class="truncate min-w-0">Search...</div></div></button></div><div class="flex-1 relative hidden lg:flex items-center ml-auto justify-end space-x-4"><nav class="text-sm"><ul class="flex space-x-6 items-center"><li><a href="mailto:hello@gpt-trainer.com" class="whitespace-nowrap font-medium text-gray-600 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" target="_blank">Support</a></li></ul></nav><div class="flex items-center"><button class="group p-2 flex items-center justify-center" aria-label="Toggle dark mode"><svg width="16" height="16" viewBox="0 0 16 16" fill="none" stroke="currentColor" xmlns="http://www.w3.org/2000/svg" class="h-4 w-4 block text-gray-400 dark:hidden group-hover:text-gray-600"><g clip-path="url(#clip0_2880_7340)"><path d="M8 1.11133V2.00022" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><path d="M12.8711 3.12891L12.2427 3.75735" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><path d="M14.8889 8H14" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><path d="M12.8711 12.8711L12.2427 12.2427" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><path d="M8 14.8889V14" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><path d="M3.12891 12.8711L3.75735 12.2427" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><path d="M1.11133 8H2.00022" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><path d="M3.12891 3.12891L3.75735 3.75735" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><path d="M8.00043 11.7782C10.0868 11.7782 11.7782 10.0868 11.7782 8.00043C11.7782 5.91402 10.0868 4.22266 8.00043 
4.22266C5.91402 4.22266 4.22266 5.91402 4.22266 8.00043C4.22266 10.0868 5.91402 11.7782 8.00043 11.7782Z" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></g><defs><clipPath id="clip0_2880_7340"><rect width="16" height="16" fill="white"></rect></clipPath></defs></svg><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-moon h-4 w-4 hidden dark:block text-gray-500 dark:group-hover:text-gray-300"><path d="M12 3a6 6 0 0 0 9 9 9 9 0 1 1-9-9Z"></path></svg></button></div></div><div class="flex lg:hidden items-center gap-2"><button type="button" class="text-gray-500 w-8 h-8 flex items-center justify-center hover:text-gray-600 dark:text-gray-400 dark:hover:text-gray-300" id="search-bar-entry-mobile"><span class="sr-only">Search...</span><svg class="h-4 w-4 bg-gray-500 dark:bg-gray-400 hover:bg-gray-600 dark:hover:bg-gray-300" style="-webkit-mask-image:url(https://mintlify.b-cdn.net/v6.6.0/solid/magnifying-glass.svg);-webkit-mask-repeat:no-repeat;-webkit-mask-position:center"></svg></button><button aria-label="More actions" class="h-7 w-5 flex items-center justify-end"><svg class="h-4 w-4 bg-gray-500 dark:bg-gray-400 hover:bg-gray-600 dark:hover:bg-gray-300" style="-webkit-mask-image:url(https://mintlify.b-cdn.net/v6.6.0/solid/ellipsis-vertical.svg);-webkit-mask-repeat:no-repeat;-webkit-mask-position:center"></svg></button></div></div></div><div class="flex items-center h-14 py-4 px-5 lg:hidden"><button type="button" class="text-gray-500 hover:text-gray-600 dark:text-gray-400 dark:hover:text-gray-300"><span class="sr-only">Navigation</span><svg class="h-4" fill="currentColor" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M0 96C0 78.3 14.3 64 32 64H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32C14.3 128 0 113.7 0 96zM0 256c0-17.7 14.3-32 32-32H416c17.7 0 32 14.3 32 32s-14.3 32-32 
32H32c-17.7 0-32-14.3-32-32zM448 416c0 17.7-14.3 32-32 32H32c-17.7 0-32-14.3-32-32s14.3-32 32-32H416c17.7 0 32 14.3 32 32z"></path></svg></button><div class="ml-4 flex text-sm leading-6 whitespace-nowrap min-w-0 space-x-3"><div class="flex items-center space-x-3"><span>Guides</span><svg width="3" height="24" viewBox="0 -9 3 24" class="h-5 rotate-0 overflow-visible fill-gray-400"><path d="M0 0L3 3L0 6" fill="none" stroke="currentColor" stroke-width="1.5" stroke-linecap="round"></path></svg></div><div class="font-semibold text-gray-900 truncate dark:text-gray-200">Best practices for preparing training data</div></div></div></div></div></div></div><div class="max-w-8xl px-4 mx-auto lg:px-8 min-h-screen"><div class="z-20 hidden lg:block fixed bottom-0 right-auto w-[18rem] top-[4rem]" id="sidebar"><div class="absolute inset-0 z-10 stable-scrollbar-gutter overflow-auto pr-8 pb-10" id="sidebar-content"><div class="relative lg:text-sm lg:leading-6"><div class="sticky top-0 h-8 bg-gradient-to-b from-background-light dark:from-background-dark"></div><div id="navigation-items"><li class="list-none"><a class="pl-4 group flex items-center lg:text-sm lg:leading-6 mb-5 sm:mb-4 font-semibold text-primary dark:text-primary-light" href="introduction.html"><div style="background:linear-gradient(45deg, #ED727B, #F6B7BB)" class="mr-4 rounded-md p-1"><svg class="h-4 w-4 secondary-opacity group-hover:fill-primary-dark group-hover:bg-white bg-white" style="-webkit-mask-image:url(https://mintlify.b-cdn.net/v6.6.0/duotone/book-open.svg);-webkit-mask-repeat:no-repeat;-webkit-mask-position:center"></svg></div>Documentation</a></li><li class="list-none"><a class="pl-4 group flex items-center lg:text-sm lg:leading-6 mb-5 sm:mb-4 font-medium text-gray-600 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" href="api-reference/api-key-setup.html"><div class="mr-4 rounded-md p-1 zinc-box group-hover:brightness-100 group-hover:ring-0 ring-1 ring-gray-950/5 dark:ring-gray-700/40"><svg 
class="h-4 w-4 secondary-opacity group-hover:fill-primary-dark group-hover:bg-white bg-gray-400 dark:bg-gray-500" style="-webkit-mask-image:url(https://mintlify.b-cdn.net/v6.6.0/duotone/code.svg);-webkit-mask-repeat:no-repeat;-webkit-mask-position:center"></svg></div>API References</a></li><li class="list-none"><a class="pl-4 group flex items-center lg:text-sm lg:leading-6 mb-5 sm:mb-4 font-medium text-gray-600 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" href="tools/tools-intro.html"><div class="mr-4 rounded-md p-1 zinc-box group-hover:brightness-100 group-hover:ring-0 ring-1 ring-gray-950/5 dark:ring-gray-700/40"><svg class="h-4 w-4 secondary-opacity group-hover:fill-primary-dark group-hover:bg-white bg-gray-400 dark:bg-gray-500" style="-webkit-mask-image:url(https://mintlify.b-cdn.net/v6.6.0/duotone/gear.svg);-webkit-mask-repeat:no-repeat;-webkit-mask-position:center"></svg></div>Tools</a></li><li class="list-none"><a class="pl-4 group flex items-center lg:text-sm lg:leading-6 mb-5 sm:mb-4 font-medium text-gray-600 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" href="whitelabel/whitelabel-intro.html"><div class="mr-4 rounded-md p-1 zinc-box group-hover:brightness-100 group-hover:ring-0 ring-1 ring-gray-950/5 dark:ring-gray-700/40"><svg class="h-4 w-4 secondary-opacity group-hover:fill-primary-dark group-hover:bg-white bg-gray-400 dark:bg-gray-500" style="-webkit-mask-image:url(https://mintlify.b-cdn.net/v6.6.0/duotone/browser.svg);-webkit-mask-repeat:no-repeat;-webkit-mask-position:center"></svg></div>Whitelabel</a></li><div class="mt-12 lg:mt-8"><h5 class="pl-4 mb-3.5 lg:mb-2.5 font-semibold text-gray-900 dark:text-gray-200">Getting Started</h5><ul><li id="/introduction" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 
hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="introduction.html"><div class="flex-1 flex items-center space-x-2.5"><div>Introduction</div></div></a></li></ul></div><div class="mt-12 lg:mt-8"><h5 class="pl-4 mb-3.5 lg:mb-2.5 font-semibold text-gray-900 dark:text-gray-200">Guides</h5><ul><li id="/creating-first-chatbot" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="creating-first-chatbot.html"><div class="flex-1 flex items-center space-x-2.5"><div>Create Your First Chatbot</div></div></a></li><li id="/lead-collection" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="lead-collection.html"><div class="flex-1 flex items-center space-x-2.5"><div>Lead Collection</div></div></a></li><li id="/human-support-escalation" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="human-support-escalation.html"><div class="flex-1 flex items-center space-x-2.5"><div>Human Support Escalation</div></div></a></li><li id="/inbox-notifications" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary 
dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="inbox-notifications.html"><div class="flex-1 flex items-center space-x-2.5"><div>Inbox Notifications</div></div></a></li><li id="/conversation-labeling" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="conversation-labeling.html"><div class="flex-1 flex items-center space-x-2.5"><div>Conversation Labeling</div></div></a></li><li id="/multi-agents-chatbot" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="multi-agents-chatbot.html"><div class="flex-1 flex items-center space-x-2.5"><div>Multi-Agents Chatbot</div></div></a></li><li id="/fine-tuning-agent-intents" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="fine-tuning-agent-intents.html"><div class="flex-1 flex items-center space-x-2.5"><div>Fine Tuning Agent Intents</div></div></a></li><li id="/supervisor-overrides" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer 
focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="supervisor-overrides.html"><div class="flex-1 flex items-center space-x-2.5"><div>AI Supervisor Overrides</div></div></a></li><li id="/byok-pricing-guide" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="byok-pricing-guide.html"><div class="flex-1 flex items-center space-x-2.5"><div>Bring Your Own Key (BYOK) and Pricing</div></div></a></li><li id="/working-with-tables" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="working-with-tables.html"><div class="flex-1 flex items-center space-x-2.5"><div>Working with Tables and CSV</div></div></a></li><li id="/best-practices" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl bg-primary/10 text-primary font-semibold dark:text-primary-light dark:bg-primary-light/10" style="padding-left:1rem" href="best-practices.html"><div class="flex-1 flex items-center space-x-2.5"><div>Best practices for preparing training data</div></div></a></li><li id="/help" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary 
dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="help.html"><div class="flex-1 flex items-center space-x-2.5"><div>Why does my chatbot not answer correctly?</div></div></a></li></ul></div><div class="mt-12 lg:mt-8"><h5 class="pl-4 mb-3.5 lg:mb-2.5 font-semibold text-gray-900 dark:text-gray-200">Function Calling</h5><ul><li id="/rag-from-external-data-provider" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="rag-from-external-data-provider.html"><div class="flex-1 flex items-center space-x-2.5"><div>RAG from an External Data Provider</div></div></a></li></ul></div><div class="mt-12 lg:mt-8"><h5 class="pl-4 mb-3.5 lg:mb-2.5 font-semibold text-gray-900 dark:text-gray-200">Authentication Webhook</h5><ul><li id="/user-identity" class="scroll-m-4 first:scroll-m-20"><a class="group mt-2 lg:mt-0 flex items-center pr-3 py-1.5 cursor-pointer focus:outline-primary dark:focus:outline-primary-light space-x-3 rounded-xl hover:bg-gray-600/5 dark:hover:bg-gray-200/5 text-gray-700 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300" style="padding-left:1rem" href="user-identity.html"><div class="flex-1 flex items-center space-x-2.5"><div>User Identity Verification</div></div></a></li></ul></div></div></div></div></div><div class="" id="content-container"><div class="flex flex-row gap-12 box-border w-full pt-40 lg:pt-10"><div class="relative grow box-border flex-col w-full mx-auto px-1 lg:pl-[23.7rem] lg:-ml-12 xl:w-[calc(100%-28rem)]" id="content-area"><header id="header" class="relative"><div class="mt-0.5 
space-y-2.5"><div class="eyebrow h-5 text-primary dark:text-primary-light text-sm font-semibold">Guides</div><div class="flex items-center"><h1 class="inline-block text-2xl sm:text-3xl font-extrabold text-gray-900 tracking-tight dark:text-gray-200">Best practices for preparing training data</h1></div></div></header><div class="flex flex-col gap-8"><div class="flex flex-col gap-6 xl:hidden [&amp;:not(:empty)]:mt-8"></div></div><div class="relative mt-8 prose prose-gray dark:prose-invert"><p>The accuracy and consistency of your chatbot depend on a number of factors:</p>
<ul>
<li>Quality of your training data</li>
<li>Large language model (LLM) selection</li>
<li>Explicitness of base prompt</li>
</ul>
<p>LLMs, like all statistics-based models, require training data during their construction. As they often say in the AI research community, “your model is only as good as your training data”.
The best way to dictate and optimize your chatbot’s performance is to clean up its training data. In the following section, we provide some best practices for structuring your training data.</p>
<p>LLMs do not “think” like humans do. They interpret and process data very differently from humans. To understand how the machine uses this data, we center our discussion on “chunks”.</p>
<h2 class="flex whitespace-pre-wrap group" id="chunk-splitting"><div class="absolute"><a href="#chunk-splitting" class="-ml-10 flex items-center opacity-0 border-0 group-hover:opacity-100" aria-label="Navigate to header"><div class="w-6 h-6 text-gray-400 rounded-md flex items-center justify-center zinc-box bg-white ring-1 ring-gray-400/30 dark:ring-gray-700/25 hover:ring-gray-400/60 dark:hover:ring-white/20"><svg xmlns="http://www.w3.org/2000/svg" fill="gray" height="12px" viewBox="0 0 576 512"><path d="M0 256C0 167.6 71.6 96 160 96h72c13.3 0 24 10.7 24 24s-10.7 24-24 24H160C98.1 144 48 194.1 48 256s50.1 112 112 112h72c13.3 0 24 10.7 24 24s-10.7 24-24 24H160C71.6 416 0 344.4 0 256zm576 0c0 88.4-71.6 160-160 160H344c-13.3 0-24-10.7-24-24s10.7-24 24-24h72c61.9 0 112-50.1 112-112s-50.1-112-112-112H344c-13.3 0-24-10.7-24-24s10.7-24 24-24h72c88.4 0 160 71.6 160 160zM184 232H392c13.3 0 24 10.7 24 24s-10.7 24-24 24H184c-13.3 0-24-10.7-24-24s10.7-24 24-24z"></path></svg></div></a></div><span class="cursor-pointer">Chunk splitting</span></h2>
<p>During retrieval-augmented generation (RAG), chunks are selected and injected into the user’s original input query, along with the base prompt. These chunks are derived directly from your
uploaded training data - PDFs, Word documents, websites, TXT files, etc. Since LLMs have token limits, we must also enforce constraints on the size of these chunks.</p>
<p>This means that even if your original document has a long chapter of text that talks about a single topic, it will have to be divided into multiple chunks and stored separately within our vector database.</p>
<p>So how can we divide up the document with minimal alterations to its original meaning?</p>
<p>Unfortunately, there is no universal solution. This is still an ongoing field of scientific research. GPT-trainer uses a combination of rule-based and statistical
relevance algorithms to divide training data into chunks, but we cannot always guarantee each chunk is self-contained, clean, and accurate. Fortunately, LLMs
specialize in working with unstructured text, and they have high tolerances for badly formatted input when producing responses.</p>
<h2 class="flex whitespace-pre-wrap group" id="chunk-quality"><div class="absolute"><a href="#chunk-quality" class="-ml-10 flex items-center opacity-0 border-0 group-hover:opacity-100" aria-label="Navigate to header"><div class="w-6 h-6 text-gray-400 rounded-md flex items-center justify-center zinc-box bg-white ring-1 ring-gray-400/30 dark:ring-gray-700/25 hover:ring-gray-400/60 dark:hover:ring-white/20"><svg xmlns="http://www.w3.org/2000/svg" fill="gray" height="12px" viewBox="0 0 576 512"><path d="M0 256C0 167.6 71.6 96 160 96h72c13.3 0 24 10.7 24 24s-10.7 24-24 24H160C98.1 144 48 194.1 48 256s50.1 112 112 112h72c13.3 0 24 10.7 24 24s-10.7 24-24 24H160C71.6 416 0 344.4 0 256zm576 0c0 88.4-71.6 160-160 160H344c-13.3 0-24-10.7-24-24s10.7-24 24-24h72c61.9 0 112-50.1 112-112s-50.1-112-112-112H344c-13.3 0-24-10.7-24-24s10.7-24 24-24h72c88.4 0 160 71.6 160 160zM184 232H392c13.3 0 24 10.7 24 24s-10.7 24-24 24H184c-13.3 0-24-10.7-24-24s10.7-24 24-24z"></path></svg></div></a></div><span class="cursor-pointer">Chunk quality</span></h2>
<p>Another source of error comes from the chunk content itself. Optimally, each chunk should be self-contained, semantically self-consistent, and grammatically
correct. If document structure is important, each chunk should also have relevant metadata specifying where in the document it comes from. However,
none of this can be guaranteed when chunks are initially extracted from uploaded text.</p>
<p>This error is especially pronounced when working with websites. Since web browsers render websites very differently from how a web scraper sees them, what
you see can be very different from what our scraper captures. Furthermore, most layout information and data residing in images / illustrations / videos are lost during the scraping process.</p>
<div><div class="p-2 not-prose relative bg-gray-50/50 rounded-2xl overflow-hidden dark:bg-gray-800/25"><div style="background-position:10px 10px" class="absolute inset-0 bg-grid-neutral-200/20 [mask-image:linear-gradient(0deg,#fff,rgba(255,255,255,0.6))] dark:bg-grid-white/5 dark:[mask-image:linear-gradient(0deg,rgba(255,255,255,0.1),rgba(255,255,255,0.5))]"></div><div class="relative rounded-xl overflow-hidden flex justify-center"></div><div class="absolute inset-0 pointer-events-none border border-black/5 rounded-2xl dark:border-white/5"></div></div></div>
<p><em>GPT-trainer’s own pricing table on <a href="https://gpt-trainer.com/pricing" target="_blank" rel="noreferrer">https://gpt-trainer.com/pricing</a> as rendered via the Chrome browser.</em></p>
<div><div class="p-2 not-prose relative bg-gray-50/50 rounded-2xl overflow-hidden dark:bg-gray-800/25"><div style="background-position:10px 10px" class="absolute inset-0 bg-grid-neutral-200/20 [mask-image:linear-gradient(0deg,#fff,rgba(255,255,255,0.6))] dark:bg-grid-white/5 dark:[mask-image:linear-gradient(0deg,rgba(255,255,255,0.1),rgba(255,255,255,0.5))]"></div><div class="relative rounded-xl overflow-hidden flex justify-center"></div><div class="absolute inset-0 pointer-events-none border border-black/5 rounded-2xl dark:border-white/5"></div></div></div>
<p><em>The same website content, after our scraper captures it and associated chunking has been done.</em></p>
<h2 class="flex whitespace-pre-wrap group" id="no-gaps-no-overlaps"><div class="absolute"><a href="#no-gaps-no-overlaps" class="-ml-10 flex items-center opacity-0 border-0 group-hover:opacity-100" aria-label="Navigate to header"><div class="w-6 h-6 text-gray-400 rounded-md flex items-center justify-center zinc-box bg-white ring-1 ring-gray-400/30 dark:ring-gray-700/25 hover:ring-gray-400/60 dark:hover:ring-white/20"><svg xmlns="http://www.w3.org/2000/svg" fill="gray" height="12px" viewBox="0 0 576 512"><path d="M0 256C0 167.6 71.6 96 160 96h72c13.3 0 24 10.7 24 24s-10.7 24-24 24H160C98.1 144 48 194.1 48 256s50.1 112 112 112h72c13.3 0 24 10.7 24 24s-10.7 24-24 24H160C71.6 416 0 344.4 0 256zm576 0c0 88.4-71.6 160-160 160H344c-13.3 0-24-10.7-24-24s10.7-24 24-24h72c61.9 0 112-50.1 112-112s-50.1-112-112-112H344c-13.3 0-24-10.7-24-24s10.7-24 24-24h72c88.4 0 160 71.6 160 160zM184 232H392c13.3 0 24 10.7 24 24s-10.7 24-24 24H184c-13.3 0-24-10.7-24-24s10.7-24 24-24z"></path></svg></div></a></div><span class="cursor-pointer">No gaps, no overlaps</span></h2>
<p>RAG relies on dynamically fetching a subset of reference data from the entire collection of training materials.
To identify which chunks contain the most relevant information, the user query goes through the same embedding process
as the chunks themselves. Then, we calculate a relevance score for every chunk based on the proximity of their embedding
vectors relative to the user’s input (cosine distance). Afterwards, the chunks are ranked, and our algorithm picks the top
n chunks that can fit into the reserved token window for the chosen LLM.</p>
<p>Since the algorithm tries to discover and fit as many relevant chunks as possible, there is the possibility that chunks
containing semantically similar, but factually inconsistent information are simultaneously injected into the reference context of the LLM call.</p>
<p>For example, if the user asks:</p>
<p><em><strong>What is the price of iPhone SE?</strong></em></p>
<p>then, the algorithm may pull the following chunks to serve as reference context:</p>
<blockquote>
<p>[Chunk 1] iPhone SE’s current price is $250.</p>
<p>[Chunk 2] Original iPhone SE is $199.</p>
<p>[Chunk 3] iPhone 5’s price is $600.</p>
</blockquote>
<p>As you can see, these chunks all explicitly mention an iPhone price, so they are semantically similar to the user’s original query.
However, they contain factually inconsistent information: Chunks 1 and 2 give different prices for the iPhone SE, and Chunk 3 describes a different model altogether. When this happens, you may notice the AI generating different responses each
time, even when the same question is asked.</p>
<p>To ensure better consistency, we recommend that you adopt a “MECE” approach when uploading your training data. MECE stands for Mutually
Exclusive, Collectively Exhaustive. In other words - no gaps, no overlaps. If your training data is structured in this way, then you minimize
the chances of conflicting information being fed to the LLM during RAG, thereby ensuring that your chatbot behaves in a more predictable and intended fashion.</p>
<h2 class="flex whitespace-pre-wrap group" id="remove-unnecessary-training-data"><div class="absolute"><a href="#remove-unnecessary-training-data" class="-ml-10 flex items-center opacity-0 border-0 group-hover:opacity-100" aria-label="Navigate to header"><div class="w-6 h-6 text-gray-400 rounded-md flex items-center justify-center zinc-box bg-white ring-1 ring-gray-400/30 dark:ring-gray-700/25 hover:ring-gray-400/60 dark:hover:ring-white/20"><svg xmlns="http://www.w3.org/2000/svg" fill="gray" height="12px" viewBox="0 0 576 512"><path d="M0 256C0 167.6 71.6 96 160 96h72c13.3 0 24 10.7 24 24s-10.7 24-24 24H160C98.1 144 48 194.1 48 256s50.1 112 112 112h72c13.3 0 24 10.7 24 24s-10.7 24-24 24H160C71.6 416 0 344.4 0 256zm576 0c0 88.4-71.6 160-160 160H344c-13.3 0-24-10.7-24-24s10.7-24 24-24h72c61.9 0 112-50.1 112-112s-50.1-112-112-112H344c-13.3 0-24-10.7-24-24s10.7-24 24-24h72c88.4 0 160 71.6 160 160zM184 232H392c13.3 0 24 10.7 24 24s-10.7 24-24 24H184c-13.3 0-24-10.7-24-24s10.7-24 24-24z"></path></svg></div></a></div><span class="cursor-pointer">Remove unnecessary training data</span></h2>
<p>RAG works by matching semantically similar chunks to the user’s input query. Since LLMs have token restrictions, we can only
fit a limited number of chunks to serve as reference context. Therefore, if your overall knowledge base is large, then the percentage of
information that can be pulled into the LLM query each time is small.</p>
<p>For instance, if you have 20 chunks worth of training data and the LLM can pull in 10 chunks to serve as reference context each time, then
each user query can make use of 50% of the entire knowledge base. On the other hand, if you have 2000 chunks total, then each user query can
only pull 0.5% of the entire knowledge base.</p>
<p>Larger knowledge bases make it less likely for the RAG algorithm to identify relevant information. Rather than dumping everything in, having a
focused set of training data significantly improves the chatbot’s performance.</p></div><div class="leading-6 mt-14"><div class="mb-12 px-0.5 flex items-center text-sm font-semibold text-gray-700 dark:text-gray-200"><a class="flex items-center space-x-3 group" href="working-with-tables.html"><svg viewBox="0 0 3 6" class="h-1.5 stroke-gray-400 overflow-visible group-hover:stroke-gray-600 dark:group-hover:stroke-gray-300"><path d="M3 0L0 3L3 6" fill="none" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="group-hover:text-gray-900 dark:group-hover:text-white">Working with Tables and CSV</span></a><a class="flex items-center ml-auto space-x-3 group" href="help.html"><span class="group-hover:text-gray-900 dark:group-hover:text-white">Why does my chatbot not answer correctly?</span><svg viewBox="0 0 3 6" class="rotate-180 h-1.5 stroke-gray-400 overflow-visible group-hover:stroke-gray-600 dark:group-hover:stroke-gray-300"><path d="M3 0L0 3L3 6" fill="none" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"></path></svg></a></div><footer id="footer" class="flex gap-12 justify-between pt-10 border-t border-gray-100 sm:flex dark:border-gray-800/50 pb-28"><div class="flex items-center justify-between"><div class="sm:flex"><a href="https://mintlify.com/preview-request?utm_campaign=poweredBy&amp;utm_medium=docs&amp;utm_source=guide.gpt-trainer.com" target="_blank" rel="noreferrer" class="text-sm text-gray-500 dark:text-gray-400 hover:text-gray-700 dark:hover:text-gray-300">Powered by Mintlify</a></div></div></footer></div></div><div class="z-10 hidden xl:flex pl-10 box-border w-[19rem]" id="table-of-contents"><div id="table-of-contents-content" class="fixed text-gray-600 text-sm leading-6 w-[16.5rem] overflow-y-auto space-y-2 h-[calc(100%-7rem)]"><div class="text-gray-700 dark:text-gray-300 font-medium flex items-center space-x-2"><svg width="16" height="16" viewBox="0 0 16 16" fill="none" stroke="currentColor" 
stroke-width="2" xmlns="http://www.w3.org/2000/svg" class="h-3 w-3"><path d="M2.44434 12.6665H13.5554" stroke-linecap="round" stroke-linejoin="round"></path><path d="M2.44434 3.3335H13.5554" stroke-linecap="round" stroke-linejoin="round"></path><path d="M2.44434 8H7.33323" stroke-linecap="round" stroke-linejoin="round"></path></svg><span>On this page</span></div><ul><li><a href="#chunk-splitting" class="py-1 block hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300">Chunk splitting</a></li><li><a href="#chunk-quality" class="py-1 block hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300">Chunk quality</a></li><li><a href="#no-gaps-no-overlaps" class="py-1 block hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300">No gaps, no overlaps</a></li><li><a href="#remove-unnecessary-training-data" class="py-1 block hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-300">Remove unnecessary training data</a></li></ul></div></div></div></div></div></div></main></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"mdxSource":{"compiledSource":"\"use strict\";\nconst {Fragment: _Fragment, jsx: _jsx, jsxs: _jsxs} = arguments[0];\nconst {useMDXComponents: _provideComponents} = arguments[0];\nfunction _createMdxContent(props) {\n  const _components = {\n    a: \"a\",\n    blockquote: \"blockquote\",\n    em: \"em\",\n    li: \"li\",\n    p: \"p\",\n    strong: \"strong\",\n    ul: \"ul\",\n    ..._provideComponents(),\n    ...props.components\n  }, {Frame, Heading, ZoomImage} = _components;\n  if (!Frame) _missingMdxReference(\"Frame\", true);\n  if (!Heading) _missingMdxReference(\"Heading\", true);\n  if (!ZoomImage) _missingMdxReference(\"ZoomImage\", true);\n  return _jsxs(_Fragment, {\n    children: [_jsx(_components.p, {\n      children: \"The accuracy and consistency of your chatbot depends on a number of factors:\"\n    }), \"\\n\", _jsxs(_components.ul, {\n      children: [\"\\n\", _jsx(_components.li, 
{\n        children: \"Quality of your training data\"\n      }), \"\\n\", _jsx(_components.li, {\n        children: \"Large language model (LLM) selection\"\n      }), \"\\n\", _jsx(_components.li, {\n        children: \"Explicitness of base prompt\"\n      }), \"\\n\"]\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"LLMs, like all statistics-based models, require training data during their construction. As they often say in the AI research community, “your model is only as good as your training data”.\\nThe best way to dictate and optimize your chatbot’s performance is to clean up its training data. In the following section, we provide some best practices for structuring your training data.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"LLMs do not “think” like humans do. They interpret and process data very differently from humans. To understand how the machine uses this data, we center our discussion on “chunks”.\"\n    }), \"\\n\", _jsx(Heading, {\n      level: \"2\",\n      id: \"chunk-splitting\",\n      children: \"Chunk splitting\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"During RAG, chunks are selected and injected into the user’s original input query, along with the base prompt. These chunks are derived directly from your\\nuploaded training data - PDFs, Word, websites, TXT files, etc. Since LLMs have token limits, we must also enforce constraints on the size of these chunks.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"This means that even if your original document has a long chapter of text that talks about a single topic, it will have to be divided into multiple chunks and stored separately within our vector database.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"So how can we divide up the document with minimal alterations to its original meaning?\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"Unfortunately there is no universal solution. 
This is still an ongoing field of scientific research. GPT-trainer uses a combination of rule-based and statistical\\nrelevance algorithms to divide training data into chunks, but we cannot always guarantee each chunk is self-contained, clean, and accurate. Fortunately, LLMs\\nspecialize in working with unstructured text, and they have high tolerances for badly formatted input when producing responses.\"\n    }), \"\\n\", _jsx(Heading, {\n      level: \"2\",\n      id: \"chunk-quality\",\n      children: \"Chunk quality\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"Another source of error comes from the chunk content itself. Optimally, each chunk should be self-contained, semantically self-consistent, and grammatically\\ncorrect. If document structure is important, each chunk should also have relevant metadata specifying where in the document it comes from. However,\\nnone of this can be guaranteed when chunks are initially extracted from uploaded text.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"This error is especially pronounced when working with websites. Since web browsers render websites very differently from how web scraper sees them, what\\nyou see can be very different from what our scraper captures. 
Furthermore, most layout information and data residing in images / illustrations / videos are lost during the scraping process.\"\n    }), \"\\n\", _jsx(Frame, {\n      children: _jsx(ZoomImage, {\n        children: _jsx(\"img\", {\n          src: \"https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/images/best-practices-1.png\"\n        })\n      })\n    }), \"\\n\", _jsx(_components.p, {\n      children: _jsxs(_components.em, {\n        children: [\"GPT-trainer’s own pricing table on \", _jsx(_components.a, {\n          href: \"https://gpt-trainer.com/pricing\",\n          children: \"https://gpt-trainer.com/pricing\"\n        }), \" as rendered via the Chrome browser.\"]\n      })\n    }), \"\\n\", _jsx(Frame, {\n      children: _jsx(ZoomImage, {\n        children: _jsx(\"img\", {\n          src: \"https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/images/best-practices-2.png\"\n        })\n      })\n    }), \"\\n\", _jsx(_components.p, {\n      children: _jsx(_components.em, {\n        children: \"The same website content, after our scraper captures it and associated chunking has been done.\"\n      })\n    }), \"\\n\", _jsx(Heading, {\n      level: \"2\",\n      id: \"no-gaps-no-overlaps\",\n      children: \"No gaps, no overlaps\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"RAG relies on dynamically fetching a subset of reference data from the entire collection of training materials.\\nTo identify which chunks contain the most relevant information, the user query goes through the same embedding process\\nas the chunks themselves. Then, we calculate a relevance score for every chunk based on the proximity of their embedding\\nvectors relative to the user’s input (cosine distance). 
Afterwards, the chunks are ranked, and our algorithm picks the top\\nn chunks that can fit into the reserved token window for the chosen LLM.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"Since the algorithm tries to discover and fit as many relevant chunks as possible, there is the possibility that chunks\\ncontaining semantically similar, but factually inconsistent information are simultaneously injected into the reference context of the LLM call.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"For example, if the user asks:\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: _jsx(_components.em, {\n        children: _jsx(_components.strong, {\n          children: \"What is the price of iPhone SE?\"\n        })\n      })\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"then, the algorithm may pull the following chunks to serve as reference context:\"\n    }), \"\\n\", _jsxs(_components.blockquote, {\n      children: [\"\\n\", _jsx(_components.p, {\n        children: \"[Chunk 1] iPhone SE’s current price is $250.\"\n      }), \"\\n\", _jsx(_components.p, {\n        children: \"[Chunk 2] Original iPhone SE is $199.\"\n      }), \"\\n\", _jsx(_components.p, {\n        children: \"[Chunk 3] iPhone 5’s price is $600.\"\n      }), \"\\n\"]\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"As you can see, these chunks all explicitly mention the price of iPhone SE, so they are semantically similar to the user’s original query.\\nHowever, they contain factually inconsistent information. When this happens, you may notice the AI generating different responses each\\ntime even if the same question was asked.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"To ensure better consistency, we recommend that you adopt a “MECE” approach when uploading your training data. MECE stands for Mutually\\nExclusive, Collectively Exhaustive. In other words - no gaps, no overlaps. 
If your training data is structured in this way, then you minimize\\nthe chances of conflicting information being fed to the LLM during RAG, thereby ensuring that your chatbot behaves in a more predictable and intended fashion.\"\n    }), \"\\n\", _jsx(Heading, {\n      level: \"2\",\n      id: \"remove-unnecessary-training-data\",\n      children: \"Remove unnecessary training data\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"RAG works by matching semantically similar chunks to the user’s input query. Since LLMs have token restrictions, we can only\\nfit a limited number of chunks to serve as reference context. Therefore, if your overall knowledge base is large, then the percentage of\\ninformation that can be pulled into the LLM query each time is small.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"For instance, if you have 20 chunks worth of training data and the LLM can pull in 10 chunks to serve as reference context each time, then\\neach user query can make use of 50% of the entire knowledge base. On the other hand, if you have 2000 chunks total, then each user query can\\nonly pull 0.5% of the entire knowledge base.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"Larger knowledge bases make it less likely for the RAG algorithm to identify relevant information. Rather than dumping everything in, having a\\nfocused set of training data significantly improves the chatbot’s performance.\"\n    })]\n  });\n}\nfunction MDXContent(props = {}) {\n  const {wrapper: MDXLayout} = {\n    ..._provideComponents(),\n    ...props.components\n  };\n  return MDXLayout ? _jsx(MDXLayout, {\n    ...props,\n    children: _jsx(_createMdxContent, {\n      ...props\n    })\n  }) : _createMdxContent(props);\n}\nreturn {\n  default: MDXContent\n};\nfunction _missingMdxReference(id, component) {\n  throw new Error(\"Expected \" + (component ? 
\"component\" : \"object\") + \" `\" + id + \"` to be defined: you likely forgot to import, pass, or provide it.\");\n}\n","frontmatter":{},"scope":{"mintConfig":{"$schema":"https://mintlify.com/schema.json","name":"GPT-trainer API","logo":{"light":"https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/logo/light.svg","dark":"https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/logo/dark.svg"},"favicon":"/logo/favicon.png","api":{"baseUrl":"https://app.gpt-trainer.com/api","auth":{"method":"bearer"}},"colors":{"primary":"#2E3F51","light":"#516F90","dark":"#0D001D","background":{"dark":"#111827"},"anchors":{"from":"#ED727B","to":"#F6B7BB"}},"topbarLinks":[{"url":"mailto:hello@gpt-trainer.com","name":"Support","_id":"676a0adaff1411a490c729a2"}],"navigation":[{"group":"Getting Started","pages":["introduction"]},{"group":"Guides","pages":["creating-first-chatbot","lead-collection","human-support-escalation","inbox-notifications","conversation-labeling","multi-agents-chatbot","fine-tuning-agent-intents","supervisor-overrides","byok-pricing-guide","working-with-tables","best-practices","help"]},{"group":"Function Calling","pages":["rag-from-external-data-provider"]},{"group":"API Usage Guides","pages":["api-reference/api-key-setup","api-reference/guide-00-chatbot-create","api-reference/guide-01-chat","api-reference/guide-02-source"]},{"group":"Authentication Webhook","pages":["user-identity"]},{"group":"Chatbots","pages":["api-reference/chatbots/properties-reference","api-reference/chatbots/create","api-reference/chatbots/update","api-reference/chatbots/fetch","api-reference/chatbots/fetch_multi","api-reference/chatbots/delete"]},{"group":"Agents","pages":["api-reference/agents/properties-reference","api-reference/agents/create","api-reference/agents/update","api-reference/agents/fetch_multi","api-reference/agents/delete"]},{"group":"Chatbot 
Sessions","pages":["api-reference/sessions/properties-reference","api-reference/sessions/create","api-reference/sessions/fetch","api-reference/sessions/fetch_multi","api-reference/sessions/delete","api-reference/sessions/delete_multi"]},{"group":"Session Messages","pages":["api-reference/messages/properties-reference","api-reference/messages/create","api-reference/messages/fetch_multi","api-reference/messages/delete","api-reference/messages/delete_multi"]},{"group":"Data Sources","pages":["api-reference/data-sources/properties-reference","api-reference/data-sources/create-file","api-reference/data-sources/create-qa","api-reference/data-sources/create-url","api-reference/data-sources/update","api-reference/data-sources/fetch_multi","api-reference/data-sources/retrain","api-reference/data-sources/delete","api-reference/data-sources/delete_multi"]},{"group":"Data Source Tags","pages":["api-reference/source-tags/create","api-reference/source-tags/fetch-multi","api-reference/source-tags/update","api-reference/source-tags/delete"]},{"group":"Tool Guides","pages":["tools/tools-intro"]},{"group":"Tools","pages":["tools/weekday"]},{"group":"Whitelabel Dashboard","pages":["whitelabel/whitelabel-intro","whitelabel/whitelabel-plans","whitelabel/whitelabel-users"]},{"group":"Integrations","pages":["whitelabel/whitelabel-zapier","whitelabel/whitelabel-make","whitelabel/whitelabel-meta"]}],"primaryTab":{"name":"Documentation"},"anchors":[{"name":"API References","url":"api-reference","icon":"code","_id":"676a0adaff1411a490c7299f"},{"name":"Tools","url":"tools","icon":"gear","_id":"676a0adaff1411a490c729a0"},{"name":"Whitelabel","url":"whitelabel","icon":"browser","_id":"676a0adaff1411a490c729a1"}],"repo":{"github":{"owner":"ks-collab","repo":"gpt-trainer-docs","contentDirectory":"","deployBranch":"main","isPrivate":false}}},"pageMetadata":{"title":"Best practices for preparing training 
data","description":null,"href":"/best-practices"}}},"mdxExtracts":{"tableOfContents":[{"title":"Chunk splitting","slug":"chunk-splitting","depth":2,"children":[]},{"title":"Chunk quality","slug":"chunk-quality","depth":2,"children":[]},{"title":"No gaps, no overlaps","slug":"no-gaps-no-overlaps","depth":2,"children":[]},{"title":"Remove unnecessary training data","slug":"remove-unnecessary-training-data","depth":2,"children":[]}],"codeExamples":{}},"description":null,"pageData":{"navWithMetadata":[{"group":"Getting Started","pages":[{"title":"Introduction","description":null,"href":"/introduction"}]},{"group":"Guides","pages":[{"title":"Create Your First Chatbot","description":null,"href":"/creating-first-chatbot"},{"title":"Lead Collection","description":null,"href":"/lead-collection"},{"title":"Human Support Escalation","description":null,"href":"/human-support-escalation"},{"title":"Inbox Notifications","description":null,"href":"/inbox-notifications"},{"title":"Conversation Labeling","description":null,"href":"/conversation-labeling"},{"title":"Multi-Agents Chatbot","description":null,"href":"/multi-agents-chatbot"},{"title":"Fine Tuning Agent Intents","description":null,"href":"/fine-tuning-agent-intents"},{"title":"AI Supervisor Overrides","description":null,"href":"/supervisor-overrides"},{"title":"Bring Your Own Key (BYOK) and Pricing","description":null,"href":"/byok-pricing-guide"},{"title":"Working with Tables and CSV","description":null,"href":"/working-with-tables"},{"title":"Best practices for preparing training data","description":null,"href":"/best-practices"},{"title":"Why does my chatbot not answer correctly?","description":null,"href":"/help"}]},{"group":"Function Calling","pages":[{"title":"RAG from an External Data Provider","description":null,"href":"/rag-from-external-data-provider"}]},{"group":"API Usage Guides","pages":[{"title":"Getting a GPT-trainer API Key","description":null,"href":"/api-reference/api-key-setup"},{"title":"Create 
Chabot","description":null,"href":"/api-reference/guide-00-chatbot-create"},{"title":"Chat with Chatbot","description":null,"href":"/api-reference/guide-01-chat"},{"title":"Uploading Data Sources","description":null,"href":"/api-reference/guide-02-source"}]},{"group":"Authentication Webhook","pages":[{"title":"User Identity Verification","description":null,"href":"/user-identity"}]},{"group":"Chatbots","pages":[{"title":"Chatbot Properties","description":"Detailed explanation of chatbot's properties","href":"/api-reference/chatbots/properties-reference"},{"title":"Create Chatbot","description":"Create a chatbot that belongs to the authenticated user","api":"POST https://app.gpt-trainer.com/api/v1/chatbot/create","href":"/api-reference/chatbots/create"},{"title":"Update Chatbot","description":"Update chatbot meta base on uuid","api":"POST https://app.gpt-trainer.com/api/v1/chatbot/{uuid}/update","href":"/api-reference/chatbots/update"},{"title":"Fetch a Chatbot","description":"Fetch single chatbot base on uuid","api":"GET https://app.gpt-trainer.com/api/v1/chatbot/{uuid}","href":"/api-reference/chatbots/fetch"},{"title":"Fetch all Chatbots","description":"Fetch the list of chatbots for current user","api":"GET https://app.gpt-trainer.com/api/v1/chatbots","href":"/api-reference/chatbots/fetch_multi"},{"title":"Delete Chatbot","description":"Delete single chatbot base on uuid","api":"DELETE https://app.gpt-trainer.com/api/v1/chatbot/{uuid}/delete","href":"/api-reference/chatbots/delete"}]},{"group":"Agents","pages":[{"title":"Agent Properties","description":"Detailed explanation of agent's properties","href":"/api-reference/agents/properties-reference"},{"title":"Create Agent","description":"Create an agent for a chatbot specified by chatbot uuid","api":"POST https://app.gpt-trainer.com/api/v1/chatbot/\u003cuuid\u003e/agent/create","href":"/api-reference/agents/create"},{"title":"Update Agent","description":"Update agent meta based on uuid","api":"POST 
https://app.gpt-trainer.com/api/v1/agent/{uuid}/update","href":"/api-reference/agents/update"},{"title":"Fetch all Agents","description":"Fetch the list of agents for a chatbot specified by chatbot uuid","api":"GET https://app.gpt-trainer.com/api/v1/chatbot/{uuid}/agents","href":"/api-reference/agents/fetch_multi"},{"title":"Delete Agent","description":"Delete single agent base on uuid","api":"DELETE https://app.gpt-trainer.com/api/v1/agent/{uuid}/delete","href":"/api-reference/agents/delete"}]},{"group":"Chatbot Sessions","pages":[{"title":"Session Properties","description":"Detailed explanation of sessions's properties","href":"/api-reference/sessions/properties-reference"},{"title":"Create Session","description":"Create a chat session for a chatbot specified by chatbot uuid","api":"POST https://app.gpt-trainer.com/api/v1/chatbot/{uuid}/session/create","href":"/api-reference/sessions/create"},{"title":"Fetch a Session","description":"Fetch single chatbot session base on uuid","api":"GET https://app.gpt-trainer.com/api/v1/session/{uuid}","href":"/api-reference/sessions/fetch"},{"title":"Fetch all Sessions","description":"Fetch the list of sessions for a chatbot specified by chatbot uuid","api":"GET https://app.gpt-trainer.com/api/v1/chatbot/{uuid}/sessions","href":"/api-reference/sessions/fetch_multi"},{"title":"Delete Session","description":"Delete a session by its UUID","api":"POST https://app.gpt-trainer.com/api/v1/session/{uuid}/delete","href":"/api-reference/sessions/delete"},{"title":"Delete Session","description":"Delete a session by its UUID","api":"POST https://app.gpt-trainer.com/api/v1/session/{uuid}/delete","href":"/api-reference/sessions/delete_multi"}]},{"group":"Session Messages","pages":[{"title":"Message Properties","description":"Detailed explanation of message's properties","href":"/api-reference/messages/properties-reference"},{"title":"Create Message","description":"Create a session message for a chatbot session specified by session 
uuid","api":"POST https://app.gpt-trainer.com/api/v1/session/{uuid}/message/stream","href":"/api-reference/messages/create"},{"title":"Fetch all Messages","description":"Fetch the list of messages for a session specified by session uuid","api":"GET https://app.gpt-trainer.com/api/v1/session/{uuid}/messages","href":"/api-reference/messages/fetch_multi"},{"title":"Delete Message","description":"Delete single message base on uuid","api":"POST https://app.gpt-trainer.com/api/v1/message/{uuid}/delete","href":"/api-reference/messages/delete"},{"title":"Delete multiple Messages","description":"Delete list of messages base on their uuids","api":"POST https://app.gpt-trainer.com/api/v1/messages/delete","href":"/api-reference/messages/delete_multi"}]},{"group":"Data Sources","pages":[{"title":"Source properties","description":"Detailed explanation of source's properties","href":"/api-reference/data-sources/properties-reference"},{"title":"Upload a File","description":"Create a File source for a chatbot specified by chatbot uuid","api":"POST https://app.gpt-trainer.com/api/v1/chatbot/{uuid}/data-source/upload","href":"/api-reference/data-sources/create-file"},{"title":"Create QA Source","description":"Create a QA source for a chatbot specified by chatbot uuid","api":"POST https://app.gpt-trainer.com/api/v1/chatbot/{uuid}/data-source/qa","href":"/api-reference/data-sources/create-qa"},{"title":"Create URL Source","description":"Create a URL source for a chatbot specified by chatbot uuid","api":"POST https://app.gpt-trainer.com/api/v1/chatbot/{uuid}/data-source/url","href":"/api-reference/data-sources/create-url"},{"title":"Update Source","description":"Update source meta base on uuid","api":"POST https://app.gpt-trainer.com/api/v1/data-source/{uuid}/update","href":"/api-reference/data-sources/update"},{"title":"Fetch list of Sources","description":"Fetch the list of sources for a chatbot specified by chatbot uuid","api":"GET 
https://app.gpt-trainer.com/api/v1/chatbot/{uuid}/data-sources","href":"/api-reference/data-sources/fetch_multi"},{"title":"Retrain Sources","description":"Retrain multiple URL data sources to fetch the latest content from them.","api":"POST https://app.gpt-trainer.com/api/v1/data-sources/url/re-scrape","href":"/api-reference/data-sources/retrain"},{"title":"Delete Source","description":"Delete single source base on uuid","api":"POST https://app.gpt-trainer.com/api/v1/data-source/{uuid}/delete","href":"/api-reference/data-sources/delete"},{"title":"Delete multiple Sources","description":"Delete list of sources base on their uuids","api":"POST https://app.gpt-trainer.com/api/v1/data-sources/delete","href":"/api-reference/data-sources/delete_multi"}]},{"group":"Data Source Tags","pages":[{"title":"Create Source Tag","description":"Create a source tag for a chabot. Source tags can be used to organize sources.","api":"POST https://app.gpt-trainer.com/api/v1/chatbot/\u003cuuid\u003e/source-tag/create","href":"/api-reference/source-tags/create"},{"title":"Fetch all Source Tags","description":"List all source tags for a chabot.","api":"GET https://app.gpt-trainer.com/api/v1/chatbot/\u003cuuid\u003e/source-tags","href":"/api-reference/source-tags/fetch-multi"},{"title":"Update Source Tag","description":"Update the properties of a source tag, including its list of documents.","api":"POST https://app.gpt-trainer.com/api/v1/source-tag/\u003cuuid\u003e/update","href":"/api-reference/source-tags/update"},{"title":"Delete Source Tag","description":"Delete a source tag based on uuid","api":"DELETE https://app.gpt-trainer.com/api/v1/source-tag/{uuid}/delete","href":"/api-reference/source-tags/delete"}]},{"group":"Tool Guides","pages":[{"title":"Introduction","description":null,"href":"/tools/tools-intro"}]},{"group":"Tools","pages":[{"title":"Weekday","description":"This function finds the day of the week, given a date. 
For example, given the date '2024-10-07', it will return a JSON: `{'weekday': 'Monday'}`","api":"GET https://tools.gpt-trainer.com/weekday","href":"/tools/weekday"}]},{"group":"Whitelabel Dashboard","pages":[{"title":"Introduction and first-time setup","description":null,"href":"/whitelabel/whitelabel-intro"},{"title":"Creating plans and pricing considerations","description":null,"href":"/whitelabel/whitelabel-plans"},{"title":"Managing your users","description":null,"href":"/whitelabel/whitelabel-users"}]},{"group":"Integrations","pages":[{"title":"Publishing a Zapier app","description":null,"href":"/whitelabel/whitelabel-zapier"},{"title":"Publishing a Make app","description":null,"href":"/whitelabel/whitelabel-make"},{"title":"Adding Meta integrations","description":null,"href":"/whitelabel/whitelabel-meta"}]}],"pageMetadata":{"title":"Best practices for preparing training data","description":null,"href":"/best-practices"},"mintConfig":{"layout":"topnav","sidebar":{"items":"container"},"topbar":{"style":"default"},"search":{"location":"top"},"rounded":"default","codeBlock":{"mode":"dark"},"$schema":"https://mintlify.com/schema.json","name":"GPT-trainer API","logo":{"light":"https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/logo/light.svg","dark":"https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/logo/dark.svg"},"favicon":"/logo/favicon.png","api":{"baseUrl":"https://app.gpt-trainer.com/api","auth":{"method":"bearer"}},"colors":{"primary":"#2E3F51","light":"#516F90","dark":"#0D001D","background":{"dark":"#111827"},"anchors":{"from":"#ED727B","to":"#F6B7BB"}},"topbarLinks":[{"url":"mailto:hello@gpt-trainer.com","name":"Support","_id":"676a0adaff1411a490c729a2"}],"navigation":[{"group":"Getting 
Started","pages":["introduction"]},{"group":"Guides","pages":["creating-first-chatbot","lead-collection","human-support-escalation","inbox-notifications","conversation-labeling","multi-agents-chatbot","fine-tuning-agent-intents","supervisor-overrides","byok-pricing-guide","working-with-tables","best-practices","help"]},{"group":"Function Calling","pages":["rag-from-external-data-provider"]},{"group":"API Usage Guides","pages":["api-reference/api-key-setup","api-reference/guide-00-chatbot-create","api-reference/guide-01-chat","api-reference/guide-02-source"]},{"group":"Authentication Webhook","pages":["user-identity"]},{"group":"Chatbots","pages":["api-reference/chatbots/properties-reference","api-reference/chatbots/create","api-reference/chatbots/update","api-reference/chatbots/fetch","api-reference/chatbots/fetch_multi","api-reference/chatbots/delete"]},{"group":"Agents","pages":["api-reference/agents/properties-reference","api-reference/agents/create","api-reference/agents/update","api-reference/agents/fetch_multi","api-reference/agents/delete"]},{"group":"Chatbot Sessions","pages":["api-reference/sessions/properties-reference","api-reference/sessions/create","api-reference/sessions/fetch","api-reference/sessions/fetch_multi","api-reference/sessions/delete","api-reference/sessions/delete_multi"]},{"group":"Session Messages","pages":["api-reference/messages/properties-reference","api-reference/messages/create","api-reference/messages/fetch_multi","api-reference/messages/delete","api-reference/messages/delete_multi"]},{"group":"Data Sources","pages":["api-reference/data-sources/properties-reference","api-reference/data-sources/create-file","api-reference/data-sources/create-qa","api-reference/data-sources/create-url","api-reference/data-sources/update","api-reference/data-sources/fetch_multi","api-reference/data-sources/retrain","api-reference/data-sources/delete","api-reference/data-sources/delete_multi"]},{"group":"Data Source 
Tags","pages":["api-reference/source-tags/create","api-reference/source-tags/fetch-multi","api-reference/source-tags/update","api-reference/source-tags/delete"]},{"group":"Tool Guides","pages":["tools/tools-intro"]},{"group":"Tools","pages":["tools/weekday"]},{"group":"Whitelabel Dashboard","pages":["whitelabel/whitelabel-intro","whitelabel/whitelabel-plans","whitelabel/whitelabel-users"]},{"group":"Integrations","pages":["whitelabel/whitelabel-zapier","whitelabel/whitelabel-make","whitelabel/whitelabel-meta"]}],"primaryTab":{"name":"Documentation"},"anchors":[{"name":"API References","url":"api-reference","icon":"code","_id":"676a0adaff1411a490c7299f"},{"name":"Tools","url":"tools","icon":"gear","_id":"676a0adaff1411a490c729a0"},{"name":"Whitelabel","url":"whitelabel","icon":"browser","_id":"676a0adaff1411a490c729a1"}],"repo":{"github":{"owner":"ks-collab","repo":"gpt-trainer-docs","contentDirectory":"","deployBranch":"main","isPrivate":false}}},"apiReferenceData":{}},"favicons":{"icons":[{"rel":"apple-touch-icon","sizes":"180x180","href":"https://mintlify.s3-us-west-1.amazonaws.com/paladinmaxinc/_generated/favicon/apple-touch-icon.png?v=3","type":"image/png"},{"rel":"icon","sizes":"32x32","href":"https://mintlify.s3-us-west-1.amazonaws.com/paladinmaxinc/_generated/favicon/favicon-32x32.png?v=3","type":"image/png"},{"rel":"icon","sizes":"16x16","href":"https://mintlify.s3-us-west-1.amazonaws.com/paladinmaxinc/_generated/favicon/favicon-16x16.png?v=3","type":"image/png"},{"rel":"shortcut 
icon","href":"https://mintlify.s3-us-west-1.amazonaws.com/paladinmaxinc/_generated/favicon/favicon.ico?v=3","type":"image/x-icon"}],"browserconfig":"https://mintlify.s3-us-west-1.amazonaws.com/paladinmaxinc/_generated/favicon/browserconfig.xml?v=3"},"subdomain":"guide.gpt-trainer.com","internalAnalyticsWriteKey":"phc_TXdpocbGVeZVm5VJmAsHTMrCofBQu3e0kN8HGMNGTVW","inkeep":{"integrationApiKey":"4f40617e2acf6b9193ebf897d3ed2d80d831b9c5b431b91d"},"trieve":{"datasetId":"5f00ed3a-c71f-499e-8e71-5e23e2290da3"},"shouldIndex":true,"org":{"createdAt":"2024-01-20T00:51:05.035Z"},"cssFiles":[],"jsFiles":[],"mdxSourceWithNoJs":{"compiledSource":"\"use strict\";\nconst {Fragment: _Fragment, jsx: _jsx, jsxs: _jsxs} = arguments[0];\nconst {useMDXComponents: _provideComponents} = arguments[0];\nfunction _createMdxContent(props) {\n  const _components = {\n    a: \"a\",\n    blockquote: \"blockquote\",\n    em: \"em\",\n    li: \"li\",\n    p: \"p\",\n    strong: \"strong\",\n    ul: \"ul\",\n    ..._provideComponents(),\n    ...props.components\n  }, {Frame, Heading, ZoomImage} = _components;\n  if (!Frame) _missingMdxReference(\"Frame\", true);\n  if (!Heading) _missingMdxReference(\"Heading\", true);\n  if (!ZoomImage) _missingMdxReference(\"ZoomImage\", true);\n  return _jsxs(_Fragment, {\n    children: [_jsx(_components.p, {\n      children: \"The accuracy and consistency of your chatbot depends on a number of factors:\"\n    }), \"\\n\", _jsxs(_components.ul, {\n      children: [\"\\n\", _jsx(_components.li, {\n        children: \"Quality of your training data\"\n      }), \"\\n\", _jsx(_components.li, {\n        children: \"Large language model (LLM) selection\"\n      }), \"\\n\", _jsx(_components.li, {\n        children: \"Explicitness of base prompt\"\n      }), \"\\n\"]\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"LLMs, like all statistics-based models, require training data during their construction. 
As they often say in the AI research community, “your model is only as good as your training data”.\\nThe best way to dictate and optimize your chatbot’s performance is to clean up its training data. In the following section, we provide some best practices for structuring your training data.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"LLMs do not “think” like humans do. They interpret and process data very differently from humans. To understand how the machine uses this data, we center our discussion on “chunks”.\"\n    }), \"\\n\", _jsx(Heading, {\n      level: \"2\",\n      id: \"chunk-splitting\",\n      children: \"Chunk splitting\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"During RAG, chunks are selected and injected into the user’s original input query, along with the base prompt. These chunks are derived directly from your\\nuploaded training data - PDFs, Word, websites, TXT files, etc. Since LLMs have token limits, we must also enforce constraints on the size of these chunks.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"This means that even if your original document has a long chapter of text that talks about a single topic, it will have to be divided into multiple chunks and stored separately within our vector database.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"So how can we divide up the document with minimal alterations to its original meaning?\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"Unfortunately there is no universal solution. This is still an ongoing field of scientific research. GPT-trainer uses a combination of rule-based and statistical\\nrelevance algorithms to divide training data into chunks, but we cannot always guarantee each chunk is self-contained, clean, and accurate. 
Fortunately, LLMs\\nspecialize in working with unstructured text, and they have high tolerances for badly formatted input when producing responses.\"\n    }), \"\\n\", _jsx(Heading, {\n      level: \"2\",\n      id: \"chunk-quality\",\n      children: \"Chunk quality\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"Another source of error comes from the chunk content itself. Optimally, each chunk should be self-contained, semantically self-consistent, and grammatically\\ncorrect. If document structure is important, each chunk should also have relevant metadata specifying where in the document it comes from. However,\\nnone of this can be guaranteed when chunks are initially extracted from uploaded text.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"This error is especially pronounced when working with websites. Since web browsers render websites very differently from how web scraper sees them, what\\nyou see can be very different from what our scraper captures. 
Furthermore, most layout information and data residing in images / illustrations / videos are lost during the scraping process.\"\n    }), \"\\n\", _jsx(Frame, {\n      children: _jsx(ZoomImage, {\n        children: _jsx(\"img\", {\n          src: \"https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/images/best-practices-1.png\"\n        })\n      })\n    }), \"\\n\", _jsx(_components.p, {\n      children: _jsxs(_components.em, {\n        children: [\"GPT-trainer’s own pricing table on \", _jsx(_components.a, {\n          href: \"https://gpt-trainer.com/pricing\",\n          children: \"https://gpt-trainer.com/pricing\"\n        }), \" as rendered via the Chrome browser.\"]\n      })\n    }), \"\\n\", _jsx(Frame, {\n      children: _jsx(ZoomImage, {\n        children: _jsx(\"img\", {\n          src: \"https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/images/best-practices-2.png\"\n        })\n      })\n    }), \"\\n\", _jsx(_components.p, {\n      children: _jsx(_components.em, {\n        children: \"The same website content, after our scraper captures it and associated chunking has been done.\"\n      })\n    }), \"\\n\", _jsx(Heading, {\n      level: \"2\",\n      id: \"no-gaps-no-overlaps\",\n      children: \"No gaps, no overlaps\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"RAG relies on dynamically fetching a subset of reference data from the entire collection of training materials.\\nTo identify which chunks contain the most relevant information, the user query goes through the same embedding process\\nas the chunks themselves. Then, we calculate a relevance score for every chunk based on the proximity of their embedding\\nvectors relative to the user’s input (cosine distance). 
Afterwards, the chunks are ranked, and our algorithm picks the top\\nn chunks that can fit into the reserved token window for the chosen LLM.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"Since the algorithm tries to discover and fit as many relevant chunks as possible, there is the possibility that chunks\\ncontaining semantically similar, but factually inconsistent information are simultaneously injected into the reference context of the LLM call.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"For example, if the user asks:\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: _jsx(_components.em, {\n        children: _jsx(_components.strong, {\n          children: \"What is the price of iPhone SE?\"\n        })\n      })\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"then, the algorithm may pull the following chunks to serve as reference context:\"\n    }), \"\\n\", _jsxs(_components.blockquote, {\n      children: [\"\\n\", _jsx(_components.p, {\n        children: \"[Chunk 1] iPhone SE’s current price is $250.\"\n      }), \"\\n\", _jsx(_components.p, {\n        children: \"[Chunk 2] Original iPhone SE is $199.\"\n      }), \"\\n\", _jsx(_components.p, {\n        children: \"[Chunk 3] iPhone 5’s price is $600.\"\n      }), \"\\n\"]\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"As you can see, these chunks all explicitly mention the price of iPhone SE, so they are semantically similar to the user’s original query.\\nHowever, they contain factually inconsistent information. When this happens, you may notice the AI generating different responses each\\ntime even if the same question was asked.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"To ensure better consistency, we recommend that you adopt a “MECE” approach when uploading your training data. MECE stands for Mutually\\nExclusive, Collectively Exhaustive. In other words - no gaps, no overlaps. 
If your training data is structured in this way, then you minimize\\nthe chances of conflicting information being fed to the LLM during RAG, thereby ensuring that your chatbot behaves in a more predictable and intended fashion.\"\n    }), \"\\n\", _jsx(Heading, {\n      level: \"2\",\n      id: \"remove-unnecessary-training-data\",\n      children: \"Remove unnecessary training data\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"RAG works by matching semantically similar chunks to the user’s input query. Since LLMs have token restrictions, we can only\\nfit a limited number of chunks to serve as reference context. Therefore, if your overall knowledge base is large, then the percentage of\\ninformation that can be pulled into the LLM query each time is small.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"For instance, if you have 20 chunks worth of training data and the LLM can pull in 10 chunks to serve as reference context each time, then\\neach user query can make use of 50% of the entire knowledge base. On the other hand, if you have 2000 chunks total, then each user query can o\\nnly pull 0.5% of the entire knowledge base.\"\n    }), \"\\n\", _jsx(_components.p, {\n      children: \"Larger knowledge bases make it less likely for the RAG algorithm to identify relevant information. Rather than dumping everything in, having a\\nfocused set of training data significantly improves the chatbot’s performance.\"\n    })]\n  });\n}\nfunction MDXContent(props = {}) {\n  const {wrapper: MDXLayout} = {\n    ..._provideComponents(),\n    ...props.components\n  };\n  return MDXLayout ? _jsx(MDXLayout, {\n    ...props,\n    children: _jsx(_createMdxContent, {\n      ...props\n    })\n  }) : _createMdxContent(props);\n}\nreturn {\n  default: MDXContent\n};\nfunction _missingMdxReference(id, component) {\n  throw new Error(\"Expected \" + (component ? 
\"component\" : \"object\") + \" `\" + id + \"` to be defined: you likely forgot to import, pass, or provide it.\");\n}\n","frontmatter":{},"scope":{"mintConfig":{"$schema":"https://mintlify.com/schema.json","name":"GPT-trainer API","logo":{"light":"https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/logo/light.svg","dark":"https://mintlify.s3.us-west-1.amazonaws.com/paladinmaxinc/logo/dark.svg"},"favicon":"/logo/favicon.png","api":{"baseUrl":"https://app.gpt-trainer.com/api","auth":{"method":"bearer"}},"colors":{"primary":"#2E3F51","light":"#516F90","dark":"#0D001D","background":{"dark":"#111827"},"anchors":{"from":"#ED727B","to":"#F6B7BB"}},"topbarLinks":[{"url":"mailto:hello@gpt-trainer.com","name":"Support","_id":"676a0adaff1411a490c729a2"}],"navigation":[{"group":"Getting Started","pages":["introduction"]},{"group":"Guides","pages":["creating-first-chatbot","lead-collection","human-support-escalation","inbox-notifications","conversation-labeling","multi-agents-chatbot","fine-tuning-agent-intents","supervisor-overrides","byok-pricing-guide","working-with-tables","best-practices","help"]},{"group":"Function Calling","pages":["rag-from-external-data-provider"]},{"group":"API Usage Guides","pages":["api-reference/api-key-setup","api-reference/guide-00-chatbot-create","api-reference/guide-01-chat","api-reference/guide-02-source"]},{"group":"Authentication Webhook","pages":["user-identity"]},{"group":"Chatbots","pages":["api-reference/chatbots/properties-reference","api-reference/chatbots/create","api-reference/chatbots/update","api-reference/chatbots/fetch","api-reference/chatbots/fetch_multi","api-reference/chatbots/delete"]},{"group":"Agents","pages":["api-reference/agents/properties-reference","api-reference/agents/create","api-reference/agents/update","api-reference/agents/fetch_multi","api-reference/agents/delete"]},{"group":"Chatbot 
Sessions","pages":["api-reference/sessions/properties-reference","api-reference/sessions/create","api-reference/sessions/fetch","api-reference/sessions/fetch_multi","api-reference/sessions/delete","api-reference/sessions/delete_multi"]},{"group":"Session Messages","pages":["api-reference/messages/properties-reference","api-reference/messages/create","api-reference/messages/fetch_multi","api-reference/messages/delete","api-reference/messages/delete_multi"]},{"group":"Data Sources","pages":["api-reference/data-sources/properties-reference","api-reference/data-sources/create-file","api-reference/data-sources/create-qa","api-reference/data-sources/create-url","api-reference/data-sources/update","api-reference/data-sources/fetch_multi","api-reference/data-sources/retrain","api-reference/data-sources/delete","api-reference/data-sources/delete_multi"]},{"group":"Data Source Tags","pages":["api-reference/source-tags/create","api-reference/source-tags/fetch-multi","api-reference/source-tags/update","api-reference/source-tags/delete"]},{"group":"Tool Guides","pages":["tools/tools-intro"]},{"group":"Tools","pages":["tools/weekday"]},{"group":"Whitelabel Dashboard","pages":["whitelabel/whitelabel-intro","whitelabel/whitelabel-plans","whitelabel/whitelabel-users"]},{"group":"Integrations","pages":["whitelabel/whitelabel-zapier","whitelabel/whitelabel-make","whitelabel/whitelabel-meta"]}],"primaryTab":{"name":"Documentation"},"anchors":[{"name":"API References","url":"api-reference","icon":"code","_id":"676a0adaff1411a490c7299f"},{"name":"Tools","url":"tools","icon":"gear","_id":"676a0adaff1411a490c729a0"},{"name":"Whitelabel","url":"whitelabel","icon":"browser","_id":"676a0adaff1411a490c729a1"}],"repo":{"github":{"owner":"ks-collab","repo":"gpt-trainer-docs","contentDirectory":"","deployBranch":"main","isPrivate":false}}},"pageMetadata":{"title":"Best practices for preparing training 
data","description":null,"href":"/best-practices"}}}},"__N_SSG":true},"page":"/_sites/[subdomain]/[[...slug]]","query":{"subdomain":"guide.gpt-trainer.com","slug":["best-practices"]},"buildId":"pChs_9tFT1YAEINLWWPhQ","isFallback":false,"isExperimentalCompile":false,"gsp":true,"scriptLoader":[]}</script>
<!-- Mirrored from guide.gpt-trainer.com/best-practices by HTTrack Website Copier/3.x [XR&CO'2014], Tue, 07 Jan 2025 14:53:24 GMT -->
</body></html>