/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/examples/models/phi-3-mini/runner.h>

#include <cinttypes>
#include <ctime>
#include <iostream>

#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/platform/log.h>

using executorch::aten::ScalarType;
using executorch::extension::Module;
using executorch::extension::llm::BPETokenizer;
using executorch::extension::llm::Sampler;
using executorch::runtime::Error;

namespace example {

#define SAMPLER_TOP 0.9f
#define ENDOFTEXT_TOKEN 32000
#define VOCABULARY_SIZE 32064

Runner::Runner(
    const std::string& model_path,
    const std::string& tokenizer_path,
    const float temperature)
    : module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
      tokenizer_(std::make_unique<BPETokenizer>()),
      sampler_(std::make_unique<Sampler>(
          VOCABULARY_SIZE,
          temperature,
          SAMPLER_TOP,
          static_cast<unsigned long long>(std::time(nullptr)))) {
  ET_CHECK_MSG(
      tokenizer_->load(tokenizer_path) == Error::Ok,
      "Failed to load tokenizer at %s",
      tokenizer_path.c_str());
  ET_LOG(
      Info,
      "Created Phi-3-mini runner: model_path=%s, tokenizer_path=%s",
      model_path.c_str(),
      tokenizer_path.c_str());
}

void Runner::generate(const std::string& prompt, std::size_t max_seq_len) {
  // Tokenize the prompt.
  auto encode_res = tokenizer_->encode(prompt, 0, 0);
  ET_CHECK_MSG(
      encode_res.error() == Error::Ok, "Failed to encode %s", prompt.c_str());
  auto input_tokens = encode_res.get();

  // Prefill the whole prompt in one forward pass, then stream the first
  // generated token.
  auto prev_token = input_tokens.back();
  auto current_token = prefill(input_tokens);
  std::cout << tokenizer_->decode(prev_token, current_token).get();
  std::cout.flush();

  std::size_t seq_len = input_tokens.size() + 1;

  // Decode autoregressively, one token per forward pass, until the
  // end-of-text token appears or the sequence-length budget is exhausted.
  while (current_token != ENDOFTEXT_TOKEN && seq_len < max_seq_len) {
    prev_token = current_token;
    current_token = run_model_step(current_token);
    std::cout << tokenizer_->decode(prev_token, current_token).get();
    std::cout.flush();

    ++seq_len;
  }

  std::cout << std::endl;
}

uint64_t Runner::logits_to_token(const exec_aten::Tensor& logits_tensor) {
  // Sample the next token id from the raw logits.
  return sampler_->sample(logits_tensor.data_ptr<float>());
}

uint64_t Runner::prefill(std::vector<uint64_t>& tokens) {
  // Run the model over the full prompt as a [1, seq_len] tensor of token ids.
  auto result = module_->forward(executorch::extension::from_blob(
      tokens.data(),
      {1, static_cast<exec_aten::SizesType>(tokens.size())},
      ScalarType::Long));
  ET_CHECK_MSG(result.error() == Error::Ok, "Failed to prefill tokens");

  return logits_to_token(result.get()[0].toTensor());
}

uint64_t Runner::run_model_step(uint64_t token) {
  // Feed the single most recent token as a [1, 1] tensor; the exported model
  // is expected to manage its KV cache internally between calls.
  auto result = module_->forward(
      executorch::extension::from_blob(&token, {1, 1}, ScalarType::Long));
  ET_CHECK_MSG(
      result.error() == Error::Ok,
      "Failed to run forward() for token %" PRIu64,
      token);

  return logits_to_token(result.get()[0].toTensor());
}

} // namespace example
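
// ---------------------------------------------------------------------------
// Usage sketch (illustrative addition, not part of the upstream file): a
// minimal driver showing how this Runner might be invoked. The model path,
// tokenizer path, temperature, and max_seq_len below are hypothetical
// placeholders; the Runner declaration is assumed to live in the runner.h
// header included above.
//
//   #include <executorch/examples/models/phi-3-mini/runner.h>
//
//   int main() {
//     example::Runner runner(
//         "phi-3-mini.pte",  // hypothetical path to the exported model
//         "tokenizer.bin",   // hypothetical path to the BPE tokenizer file
//         0.8f);             // sampling temperature
//     runner.generate("What is the capital of France?", /*max_seq_len=*/128);
//     return 0;
//   }
// ---------------------------------------------------------------------------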