digitalmars.D - This one simple trick allows D programmer use llama.cpp, rust
- evilrat (197/197) Mar 21 Just like the title says, no need to make bindings anymore, just
- Andrea Fontana (2/6) Mar 21 Wow! :)
- Andrea Fontana (14/19) Mar 21 Tested on linux, it works too!
- Serg Gini (5/9) Mar 21 Nice catch. Thanks for sharing!
- Martyn (8/17) Mar 22 This is really awesome!
- Chris Piker (4/9) Mar 26 Can confirm with the NAIF SPICE library. Tested with dmd v2.103
Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file. Tested with dmd v2.107 on Windows.

__llamad.c__:
```c
#include "llama.h"
```

A ported D version of the `simple` example from llama.cpp, __llama-d.d__:
```d
module llama_d;

import std.string;
import std.stdio;

import llamad; // imports llamad.c
// pragma(msg, __traits(allMembers, llamad));

void main(string[] args)
{
    if (args.length < 3)
    {
        writeln("LLAMA D DEMO USAGE: llama-d <path_to_model> <your_prompt>");
        return;
    }

    llama_backend_init();
    llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);

    auto mparams = llama_model_default_params();
    // mparams.n_gpu_layers = 30; // offload layers to the GPU to accelerate inference

    auto ctx_params = llama_context_default_params();
    ctx_params.n_ctx = 2048;

    import std.parallelism;
    ctx_params.n_threads = totalCPUs - 1;
    ctx_params.n_threads_batch = ctx_params.n_threads_batch == -1 ? ctx_params.n_threads : ctx_params.n_threads_batch;

    llama_model* model = llama_load_model_from_file(toStringz(args[1]), mparams);
    llama_context* ctx = llama_new_context_with_model(model, ctx_params);

    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    const bool allow_special = false;

    string prompt = args[2];
    if (!prompt.length)
        return;

    // convert prompt to embeddings (tokens)
    llama_token[] embd_inp;
    embd_inp.length = prompt.length;

    writeln("tokenizing...");

    auto n_of_tok = llama_tokenize(llama_get_model(ctx), prompt.ptr, cast(int) prompt.length,
        embd_inp.ptr, cast(int) embd_inp.length, add_bos, allow_special);
    embd_inp.length = n_of_tok;

    if (!n_of_tok)
    {
        writeln("no tokens generated, something gone wrong");
        return;
    }

    writeln("input has ", n_of_tok, " tokens");
    foreach (id; embd_inp)
    {
        write(llama_token_to_piece(ctx, id));
    }
    writeln();

    // total length of the sequence including the prompt
    const int n_len = 128;

    const int n_ctx = llama_n_ctx(ctx);
    const int n_kv_req = cast(int)(embd_inp.length + (n_len - embd_inp.length));

    if (n_kv_req > n_ctx)
    {
        writeln("error: prompt is too long");
        return;
    }

    writeln("building batch");

    // create a llama_batch with size 512
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < embd_inp.length; i++)
    {
        // note that seq_id = [0] is required as there should be at least one sequence
        llama_batch_add(batch, embd_inp[i], cast(int) i, [0], false);
    }

    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    writeln("decoding batch");

    if (llama_decode(ctx, batch) != 0)
    {
        writeln("llama_decode() failed");
        return;
    }

    // main loop
    int n_cur = batch.n_tokens;
    int n_decode = 0;

    const auto t_main_start = ggml_time_us();

    while (n_cur <= n_len)
    {
        // sample the next token
        {
            auto n_vocab = llama_n_vocab(model);
            auto logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

            llama_token_data[] candidates;
            candidates.reserve(n_vocab);

            for (llama_token token_id = 0; token_id < n_vocab; token_id++)
            {
                candidates ~= llama_token_data(token_id, logits[token_id], 0.0f);
            }

            llama_token_data_array candidates_p = { candidates.ptr, cast(int) candidates.length, false };

            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of stream?
            if (new_token_id == llama_token_eos(model) || n_cur == n_len)
            {
                writeln();
                break;
            }

            writef("%s", llama_token_to_piece(ctx, new_token_id));

            // prepare the next batch
            llama_batch_clear(batch);

            // push this new token for the next evaluation
            llama_batch_add(batch, new_token_id, n_cur, [0], true);

            n_decode += 1;
        }

        n_cur += 1;

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch))
        {
            writefln("%s : failed to eval, return code %d\n", __FUNCTION__, 1);
            return;
        }
    }

    const auto t_main_end = ggml_time_us();

    llama_print_timings(ctx);
    writeln();

    // cleanup
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
}

void llama_batch_add(ref llama_batch batch, llama_token id, llama_pos pos, const llama_seq_id[] seq_ids, bool logits)
{
    batch.token[batch.n_tokens] = id;
    batch.pos[batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = cast(int) seq_ids.length;
    for (size_t i = 0; i < seq_ids.length; ++i)
    {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits[batch.n_tokens] = logits;

    batch.n_tokens++;
}

string llama_token_to_piece(llama_context* ctx, llama_token token)
{
    char[] result;
    result.length = 8;
    const int n_tokens = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
    if (n_tokens < 0)
    {
        // negative return value is the required buffer size; retry with it
        result.length = -n_tokens;
        int check = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
        assert(check == -n_tokens);
    }
    else
    {
        result.length = n_tokens;
    }
    return cast(string) result;
}

void llama_batch_clear(ref llama_batch batch)
{
    batch.n_tokens = 0;
}
```

Build inside the llama.cpp folder with this command (I've been using CUDA, but it is possible to build without it):
```bat
dmd llama-d.d llamad.c -m64 build/ggml_static.lib build/llama.lib -L/LIBPATH:"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.3/lib/x64" cuda.lib cudart.lib cufft.lib cublas.lib ucrtd.lib -L/NODEFAULTLIB:libucrt.lib -L/NODEFAULTLIB:libcmt.lib msvcprtd.lib
```

And **run**:
```bat
llama-d "E:\ML\pretrained\speechless-llama2-hermes-orca-platypus-wizardlm-13b.Q5_K_M.gguf" "How to quit vim?"
```
Mar 21
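For readers wondering what makes the dummy-file trick work: it relies on dmd's ImportC feature, which compiles a .c file passed on the command line and makes its declarations importable from D like a regular module. A minimal sketch of just the mechanism, using a hypothetical mathlib example rather than llama.cpp (the file and function names here are for illustration only):

```c
// mathlib.c - stands in for the dummy file; a real project would just #include a vendor header here
int add(int a, int b) { return a + b; }
```

```d
// app.d - the C translation unit is imported like any D module
import mathlib;   // resolved via ImportC when mathlib.c is passed to dmd
import std.stdio;

void main()
{
    writeln(add(2, 3)); // prints 5
}
```

Built the same way as in the post, by handing both files to the compiler: `dmd app.d mathlib.c`.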
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
> Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file.

Wow! :)
Mar 21
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
> Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file. Tested with dmd v2.107 on Windows.

Tested on Linux, it works too!

```
<dummy00022>, why is sky blue?
The sky is blue because of the way that the atmosphere scatters sunlight. Blue light has more energy than the other colors in the visible spectrum, so it's more likely to penetrate the atmosphere and reach our eyes. This causes the blue light to be scattered in all directions, making the sky appear blue. Additionally, the color of the sky can change depending on the weather and the time of day. For example, the sky may appear more orange or red during sunrise or sunset when the sun is low
```

Andrea
Mar 21
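For anyone wanting to reproduce the Linux result: the post only shows a Windows/CUDA build line, so a rough CPU-only Linux equivalent might look like the sketch below. The static library names and the `build/` layout are assumptions about a cmake build of llama.cpp, not details stated in the thread.

```sh
# hypothetical CPU-only Linux build, assuming llama.cpp was built with cmake into ./build
dmd llama-d.d llamad.c -m64 \
    build/libllama.a build/libggml_static.a \
    -L-lstdc++ -L-lm -L-lpthread
```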
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
> Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file.

Nice catch. Thanks for sharing!

Another option is using a pure D implementation based on the llama2.c code from Karpathy: https://github.com/cyrusmsk/llama2.d
Mar 21
On Thursday, 21 March 2024 at 22:42:31 UTC, Serg Gini wrote:
> On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
>> Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file.
>
> Nice catch. Thanks for sharing! Another option is using a pure D implementation based on the llama2.c code from Karpathy: https://github.com/cyrusmsk/llama2.d

This is really awesome!

Only issue I have with it is the title. It should have ended with either:

- .. and that's a good thing!
- or .. let's get into it.

You are now a blogger, a journalist, a "youtuber" lol
Mar 22
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
> Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file. Tested with dmd v2.107 on Windows.

Can confirm with the NAIF SPICE library. Tested with dmd v2.103 on Linux.

Nice!
Mar 26
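The same pattern presumably carries over to CSPICE. A hypothetical minimal check is sketched below; the file names and the idea of linking against the toolkit's `cspice.a` are assumptions for illustration, not details from Chris's post.

```c
// spiced.c - dummy translation unit pulling in the CSPICE header
#include "SpiceUsr.h"
```

```d
// spicetest.d - print the toolkit version via the C API imported through ImportC
import spiced;
import std.stdio;
import std.string : fromStringz;

void main()
{
    // tkvrsn_c returns a C string identifying the toolkit release
    writeln("CSPICE: ", fromStringz(tkvrsn_c("TOOLKIT")));
}
```

Built the same way as the llama.cpp example: pass both files plus the toolkit archive to dmd, with the CSPICE include directory visible to the C preprocessor.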