// Source: src/resources/models.ts from the zeroentropy-ai/zeroentropy-node repository (main branch) on GitHub.
// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

import { APIResource } from '../resource';
import * as Core from '../core';

export class Models extends APIResource {
  /**
   * Computes embeddings for the supplied input text using a ZeroEntropy
   * embedding model.
   *
   * Results come back in the same order as the text that was supplied. The
   * embeddings are built so that a query has high cosine similarity with the
   * documents that are relevant to it.
   *
   * By default, an organization is limited to `2,500,000` bytes-per-minute and
   * 1000 QPM, and ratelimits are refreshed every 15 seconds. Requests beyond that
   * limit are throttled into `latency: "slow"` mode, up to `20,000,000`
   * bytes-per-minute. Requests beyond that second limit receive a `429` error. To
   * ask for higher ratelimits, contact
   * [founders@zeroentropy.dev](mailto:founders@zeroentropy.dev) or message us on
   * [Discord](https://go.zeroentropy.dev/discord) or
   * [Slack](https://go.zeroentropy.dev/slack)!
   *
   * @param body - The embedding request parameters, passed as `body` of the POST
   *   to `/models/embed`.
   * @param options - Optional request options; they are spread after `body` when
   *   the request is assembled.
   * @returns The pending API call, typed as a `ModelEmbedResponse`.
   */
  embed(body: ModelEmbedParams, options?: Core.RequestOptions): Core.APIPromise<ModelEmbedResponse> {
    // `options` is spread last, exactly as in a single inline object literal.
    const requestOptions = { body, ...options };
    return this._client.post('/models/embed', requestOptions);
  }

  /**
   * Reranks the supplied documents against the supplied query.
   *
   * The results are returned sorted by descending relevance. Each result carries
   * an index and a score: the index refers to a position in the documents array
   * that was passed in, and the score is the query-document relevancy determined
   * by the reranker model.
   *
   * By default, an organization is limited to `2,500,000` bytes-per-minute and
   * 1000 QPM, and ratelimits are refreshed every 15 seconds. Requests beyond that
   * limit are throttled into `latency: "slow"` mode, up to `20,000,000`
   * bytes-per-minute. Requests beyond that second limit receive a `429` error. To
   * ask for higher ratelimits, contact
   * [founders@zeroentropy.dev](mailto:founders@zeroentropy.dev) or message us on
   * [Discord](https://go.zeroentropy.dev/discord) or
   * [Slack](https://go.zeroentropy.dev/slack)!
   *
   * @param body - The rerank request parameters, passed as `body` of the POST to
   *   `/models/rerank`.
   * @param options - Optional request options; they are spread after `body` when
   *   the request is assembled.
   * @returns The pending API call, typed as a `ModelRerankResponse`.
   */
  rerank(body: ModelRerankParams, options?: Core.RequestOptions): Core.APIPromise<ModelRerankResponse> {
    // `options` is spread last, exactly as in a single inline object literal.
    const requestOptions = { body, ...options };
    return this._client.post('/models/rerank', requestOptions);
  }
}

/**
 * Response payload of the embed endpoint.
 */
export interface ModelEmbedResponse {
  /** The embedding results, as a list. */
  results: ModelEmbedResponse.Result[];

  /** Token statistics for the request. */
  usage: ModelEmbedResponse.Usage;
}

export namespace ModelEmbedResponse {
  /**
   * A single embedding result.
   */
  export interface Result {
    /**
     * Embedding of the input text, given as an array of floats. When the `base64`
     * format is requested, it is instead an fp32 little endian byte array encoded
     * as a base64 string.
     */
    embedding: number[] | string;
  }

  /**
   * Token statistics for the request.
   */
  export interface Usage {
    /** Total number of bytes in the request; used for ratelimiting. */
    total_bytes: number;

    /** Total number of tokens in the request; used for billing. */
    total_tokens: number;
  }
}

/**
 * Response payload of the rerank endpoint.
 */
export interface ModelRerankResponse {
  /**
   * The inference mode that was actually used. When `auto` is requested, `fast`
   * is used by default and `slow` is the fallback once your ratelimit is exceeded.
   * Otherwise this field is identical to the requested latency mode.
   */
  actual_latency_mode: 'fast' | 'slow';

  /**
   * Total time, in seconds, from the rerank request being received to the rerank
   * response being returned. Client latency should equal `e2e_latency` plus your
   * ping to ZeroEntropy's API.
   */
  e2e_latency: number;

  /**
   * Time, in seconds, spent actually inferencing the request. A value
   * significantly lower than `e2e_latency` is likely due to ratelimiting. Please
   * request a higher ratelimit at
   * [founders@zeroentropy.dev](mailto:founders@zeroentropy.dev) or message us on
   * [Discord](https://go.zeroentropy.dev/discord) or
   * [Slack](https://go.zeroentropy.dev/slack)!
   */
  inference_latency: number;

  /** The results, sorted by descending relevance to the query. */
  results: ModelRerankResponse.Result[];

  /** Total number of bytes in the request; used for ratelimiting. */
  total_bytes: number;

  /** Total number of tokens in the request; used for billing. */
  total_tokens: number;
}

export namespace ModelRerankResponse {
  /**
   * A single reranked document entry.
   */
  export interface Result {
    /**
     * Index of this document within the original document array that was passed
     * into the request.
     */
    index: number;

    /**
     * Relevance score between this document and the query, ranging between 0.0
     * and 1.0. The score depends only on the query and the scored document; other
     * documents do not affect it. The value is intended to be deterministic, but
     * it may vary slightly due to floating point error.
     */
    relevance_score: number;
  }
}

/**
 * Request parameters of the embed endpoint.
 */
export interface ModelEmbedParams {
  /** The text to embed: a single string or a list of strings. */
  input: string | string[];

  /** The type of the input. For retrieval tasks, either `query` or `document`. */
  input_type: 'query' | 'document';

  /** ID of the model to embed with. Options are: ["zembed-1"] */
  model: string;

  /**
   * Output dimensionality of the embedding model. For `zembed-1`, the available
   * options are: [2560, 1280, 640, 320, 160, 80, 40].
   */
  dimensions?: number | null;

  /**
   * Output format of the embedding. With `float`, an array of floats is returned
   * for each embedding. With `base64`, an f32 little endian byte array is
   * returned, encoded as a base64 string. `base64` is significantly more
   * efficient than `float`. The default is `float`.
   */
  encoding_format?: 'float' | 'base64';

  /**
   * Whether the call is inferenced "fast" or "slow". RateLimits for slow API
   * calls are orders of magnitude higher, but you can expect 2-20 second latency.
   * Fast inferences are guaranteed subsecond, but their rate limits are lower.
   * When not specified, a "fast" call is attempted first, and a slow call is
   * executed if you have exceeded your fast rate limit. When explicitly set to
   * "fast", a 429 is returned if the call cannot be executed fast.
   */
  latency?: 'fast' | 'slow' | null;
}

/**
 * Request parameters of the rerank endpoint.
 */
export interface ModelRerankParams {
  /** The documents to rerank, as a list. Each document is a string. */
  documents: string[];

  /**
   * ID of the model to rerank with. Options are: ["zerank-2", "zerank-1",
   * "zerank-1-small"]
   */
  model: string;

  /** The query that the documents are reranked by. */
  query: string;

  /**
   * Whether the call is inferenced "fast" or "slow". RateLimits for slow API
   * calls are orders of magnitude higher, but you can expect >10 second latency.
   * Fast inferences are guaranteed subsecond, but their rate limits are lower.
   * When not specified, a "fast" call is attempted first, and a slow call is
   * executed if you have exceeded your fast rate limit. When explicitly set to
   * "fast", a 429 is returned if the call cannot be executed fast.
   */
  latency?: 'fast' | 'slow' | null;

  /**
   * When provided, only the top `n` documents are returned in the results array.
   * Otherwise, `n` is the length of the provided documents array.
   */
  top_n?: number | null;
}

/**
 * Type-only re-exports that make this file's request and response types
 * reachable as members of `Models` (for example `Models.ModelEmbedParams`).
 */
export declare namespace Models {
  export {
    type ModelEmbedResponse,
    type ModelRerankResponse,
    type ModelEmbedParams,
    type ModelRerankParams,
  };
}