Ryzen_AI_Example/main.cpp at main · Jays-1111/Ryzen_AI_Example · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
// Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <cstdlib>

#include "ort_genai.h"
#include "util/prompt_reader.h"
//
#include <sstream>
#if _WIN32
extern "C" {
#include "util/getopt.h"
}
#include <codecvt>
#include <locale>
using convert_t = std::codecvt_utf8<wchar_t>;
std::wstring_convert<convert_t, wchar_t> strconverter;
#else
#include <getopt.h>
#include <cstring>
#include <sstream>
#endif

// C++ API Example

// Largest prompt (in tokens) this example accepts; longer prompts are skipped.
static constexpr size_t kMaxPromptTokens = 2048;

// Lower bound applied to the requested number of new tokens (-n).
static constexpr int kMinNewTokens = 20;

// Parses a comma-separated list of integers such as "128,256,512".
// Parsing stops at the first item that cannot be read as an int.
static std::vector<int> parse_prompt_lengths(const std::string& csv) {
  std::vector<int> lengths;
  std::stringstream ss(csv);
  for (int value = 0; ss >> value;) {
    lengths.push_back(value);
    if (ss.peek() == ',')
      ss.ignore();
  }
  return lengths;
}

// Returns true when the EN_PROFILE environment variable holds a non-zero
// integer. A missing or non-numeric value disables profiling mode instead of
// throwing.
static bool profiling_enabled() {
  const char* raw = std::getenv("EN_PROFILE");
  if (raw == nullptr)
    return false;

  char* end = nullptr;
  const long value = std::strtol(raw, &end, 10);
  return end != raw && value != 0;
}

/// Loads a model, then reads prompts from `istr` one at a time and streams the
/// generated text to stdout together with timing statistics.
///
/// @param model_path         Path handed to OgaModel::Create.
/// @param max_new_tokens     Requested number of timed decode steps per prompt;
///                           values below kMinNewTokens are raised to it.
/// @param istr               Prompt source. When it is std::cin an
///                           "Enter Prompt: " line is printed before each read.
/// @param use_chat_template  Wrap each prompt in <|im_start|>/<|im_end|> markers.
/// @param inp_prompt_len     Comma-separated token limits; the N-th value caps
///                           the N-th prompt. Missing or non-positive entries
///                           leave the prompt untruncated.
/// @param max_seq_len        Value passed as the "max_length" search option.
///
/// Any std::exception raised while loading or generating is reported on stderr
/// and ends the function. When EN_PROFILE is set to a non-zero integer only the
/// first generated prompt is processed.
void CXX_API(const char* model_path, int max_new_tokens, std::istream& istr, bool use_chat_template, std::string inp_prompt_len, int max_seq_len) {

  std::cout << "Creating model..." << std::endl;
  try {
    const auto load_start = std::chrono::steady_clock::now();
    auto model = OgaModel::Create(model_path);
    const auto load_end = std::chrono::steady_clock::now();

    const double load_ms = std::chrono::duration<double, std::milli>(load_end - load_start).count();
    std::cout << "\nSession Create time (ms): " << load_ms << std::endl << std::endl;

    std::cout << "Creating tokenizer..." << std::endl;
    auto tokenizer = OgaTokenizer::Create(*model);
    auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);

    // Enable for profiling
    const bool en_profile = profiling_enabled();

    const std::vector<int> prompt_lengths = parse_prompt_lengths(inp_prompt_len);
    size_t prompt_ind = 0;

    // Compare in the signed domain so a negative -n falls back to the minimum
    // instead of wrapping around to a huge unsigned budget.
    const unsigned int max_tkns = static_cast<unsigned int>(
        max_new_tokens < kMinNewTokens ? kMinNewTokens : max_new_tokens);

    while (true) {
      std::string text;

      // Ask for prompt when prompt is not from file
      if (&istr == &std::cin)
        std::cout << "Enter Prompt: " << std::endl;

      // NOTE(review): getline_in_a_loop comes from util/prompt_reader.h;
      // presumably it returns false once the input is exhausted.
      if (!getline_in_a_loop(istr, text))
        break;

      const std::string prompt = use_chat_template
          ? "<|im_start|>user\n" + text + "<|im_end|>\n<|im_start|>assistant\n"
          : text;

      auto sequences = OgaSequences::Create();
      tokenizer->Encode(prompt.c_str(), *sequences);

      // Decide how many of the encoded tokens are fed to the generator.
      const size_t encoded_count = sequences->SequenceCount(0);
      size_t prompt_count = encoded_count;
      if (prompt_ind < prompt_lengths.size()) {
        const int limit = prompt_lengths[prompt_ind];
        if (limit > 0 && static_cast<size_t>(limit) < encoded_count) {
          std::cout << "Reducing the number of tokens for input prompt from " << encoded_count << " to "
                    << limit << std::endl;
          prompt_count = static_cast<size_t>(limit);
        }
      }
      // The limit list advances once per prompt read, even if the prompt is
      // rejected below.
      ++prompt_ind;

      if (prompt_count > kMaxPromptTokens) {
        std::cout << "\nPrompt Number of Tokens: " << prompt_count << std::endl;
        std::cout << "Error: Only <= " << kMaxPromptTokens << " Prompt length is supported currently!\n" << std::endl;
        continue;
      }

      // Build the (possibly truncated) prompt through the public OgaSequences
      // accessors rather than casting the opaque handle to a std::vector.
      auto prompt_sequences = OgaSequences::Create();
      prompt_sequences->Append(sequences->SequenceData(0), prompt_count);

      std::cout << "Generating response..." << std::endl;
      auto params = OgaGeneratorParams::Create(*model);
      params->SetSearchOption("max_length", max_seq_len);

      auto generator = OgaGenerator::Create(*model, *params);
      std::cout << "Generator created ..." << std::endl;

      // Reported as "Prompt TTFT": the time spent inside AppendTokenSequences.
      const auto prefill_start = std::chrono::steady_clock::now();
      generator->AppendTokenSequences(*prompt_sequences);
      const auto prefill_end = std::chrono::steady_clock::now();
      const double prefill_ms = std::chrono::duration<double, std::milli>(prefill_end - prefill_start).count();

      // The first generated token is emitted outside the timed loop and is
      // not included in the "New tokens" / "Tokens / sec" figures.
      generator->GenerateNextToken();
      const auto first_token = generator->GetSequenceData(0)[generator->GetSequenceCount(0) - 1];
      std::cout << tokenizer_stream->Decode(first_token) << std::flush;

      double decode_ms = 0.0;
      unsigned int ntokens = 0;
      while (!generator->IsDone()) {
        const auto step_start = std::chrono::steady_clock::now();
        generator->GenerateNextToken();
        const auto step_end = std::chrono::steady_clock::now();

        decode_ms += std::chrono::duration<double, std::milli>(step_end - step_start).count();
        ntokens++;

        const auto num_tokens = generator->GetSequenceCount(0);
        const auto new_token = generator->GetSequenceData(0)[num_tokens - 1];
        std::cout << tokenizer_stream->Decode(new_token) << std::flush;

        if (ntokens == max_tkns)
          break;
      }
      std::cout << std::endl << std::endl;

      std::cout << "\nPrompt length: " << prompt_count << std::endl;
      std::cout << "New tokens: " << ntokens << std::endl << std::endl;

      // Avoid printing NaN when no timed decode step ran.
      const double tokens_per_sec = decode_ms > 0.0 ? ntokens / (decode_ms / 1000.0) : 0.0;
      std::cout << "Prompt TTFT (ms): " << prefill_ms << std::endl;
      std::cout << "Tokens / sec: " << tokens_per_sec << std::endl;

      for (int i = 0; i < 3; ++i)
        std::cout << std::endl;

      // Profiling mode processes a single prompt only.
      if (en_profile)
        break;
    }
  } catch (const std::exception& e) {
    std::cerr << "Error: " << e.what() << std::endl;
  }
}

// Prints the command-line synopsis for this program to stdout.
// `name` is the program name shown on the first line (typically argv[0]).
static void usage(const char *name) {
  // One entry per supported option, in the order they are displayed.
  static const char* const kOptionLines[] = {
      " [-m: model path]",
      " [-f: prompt file]",
      " [-n: max new tokens]",
      " [-c: use chat template]",
      " [-t: input prompt token length]",
      " [-l: max length to be set in search options]",
      " [-h: help]",
  };

  std::cout << "usage: " << name << "\n";
  for (const char* option_line : kOptionLines) {
    std::cout << option_line << "\n";
  }
  // Trailing blank line, and flush so the help text appears immediately.
  std::cout << std::endl;
}

// Parses `text` as an int for the command-line option `-<flag>`.
// Returns true and stores the value in `out` on success; on failure reports
// the problem on stderr and returns false, leaving `out` untouched.
static bool parse_int_option(char flag, const char* text, int& out) {
  try {
    out = std::stoi(text);
    return true;
  } catch (const std::exception&) {
    std::cerr << "Invalid integer value for -" << flag << ": " << text << std::endl;
    return false;
  }
}

// Entry point: parses the command line, echoes the effective settings and
// runs CXX_API on prompts read from a file (-f) or from stdin.
//
// Exit status: 0 on success or after -h, 1 on a command-line error,
// -1 when the prompt file cannot be opened.
int main(int argc, char** argv) {
  std::cout << "Model Generate using OGA ..." << std::endl;
  int opt = 0;
  bool use_chat_template = false;
  std::string model_path;
  std::string prompt_file;
  int max_new_tokens = 32;
  int max_seq_len = 4096;
  std::string inp_prompt_len;
  while ((opt = getopt(argc, argv, "chm:f:n:t:l:")) != -1) {
    switch (opt) {
    case 'c':
      use_chat_template = true;
      break;
    case 'm':
      model_path = std::string(optarg);
      break;
    case 'n':
      // A malformed number is a usage error, not an uncaught exception.
      if (!parse_int_option('n', optarg, max_new_tokens)) {
        usage(argv[0]);
        return 1;
      }
      break;
    case 'f':
      prompt_file = std::string(optarg);
      break;
    case 't':
      inp_prompt_len = std::string(optarg);
      break;
    case 'l':
      if (!parse_int_option('l', optarg, max_seq_len)) {
        usage(argv[0]);
        return 1;
      }
      break;
    case 'h':
      // Help was explicitly requested, so this is not a failure.
      usage(argv[0]);
      return 0;
    default:
      usage(argv[0]);
      return 1;
    }
  }

  // Defensive check kept from the original; getopt is not expected to move
  // optind past argc.
  if (optind > argc) {
    usage(argv[0]);
    return 1;
  }

  std::cout << "- Model Path: " << model_path << std::endl;
  std::cout << "- Prompt File: " << prompt_file << std::endl;
  std::cout << "- Max new tokens.: " << max_new_tokens << std::endl;
  std::cout << "- Use chat template: " << use_chat_template << std::endl;
  std::cout << "- Input prompt token length : " << inp_prompt_len << std::endl;
  std::cout << "- Max length : " << max_seq_len << std::endl;

  std::cout << "---------------------------------------" << std::endl;

  // Responsible for cleaning up the library during shutdown
  OgaHandle handle;

  if (prompt_file.empty()) {
    // No prompt file given: read prompts interactively from stdin.
    CXX_API(model_path.c_str(), max_new_tokens, std::cin, use_chat_template, inp_prompt_len, max_seq_len);
  } else {
    std::ifstream ifs(prompt_file.c_str());
    if (!ifs.is_open()) {
      std::cerr << "Couldn't open file : " << prompt_file << std::endl;
      return -1;
    }
    CXX_API(model_path.c_str(), max_new_tokens, ifs, use_chat_template, inp_prompt_len, max_seq_len);
  }

  return 0;
}