
digitalmars.D - This one simple trick allows D programmers to use llama.cpp, Rust

evilrat <evilrat666 gmail.com> writes:
Just like the title says, there's no need to write bindings anymore: just make a dummy C file with a single include and start building your D app powered by llama.cpp. That's it, no extra work needed beyond that one dummy file.

Tested with dmd v2.107 on Windows.



__llamad.c__:

```c
#include "llama.h"
```
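
How it works: dmd's ImportC compiles any `.c` file passed on the command line, so everything declared in `llama.h` becomes importable from D under the stub's module name (`llamad` here). A quick way to sanity-check what came through is to dump the generated module's members, the same trick as the commented-out `pragma(msg, ...)` line in the listing below. A minimal sketch (the file/module name `checkimportc` is just illustrative; build it like the full example below, passing both the `.d` file and `llamad.c` to dmd):

```d
// checkimportc.d -- probe what ImportC pulled in from llama.h via the llamad.c stub
module checkimportc;

import std.stdio : writeln;
import llamad; // the one-line stub above, compiled by dmd's ImportC

void main()
{
    // compile-time check that a known llama.h symbol made it through
    static assert(is(typeof(&llama_backend_init)),
                  "llama.h was not picked up - check the stub's include path");

    // list every declaration the generated llamad module exposes
    foreach (name; __traits(allMembers, llamad))
        writeln(name);
}
```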

A D port of the `simple` example from llama.cpp:

__llama-d.d__:

```d
module llama_d;

import std.string;
import std.stdio;

import llamad; // imports llamad.c

// pragma(msg, __traits(allMembers, llamad));

void main(string[] args)
{
     if (args.length < 3) {
         writeln("LLAMA D DEMO USAGE: llama-d <path_to_model> <your_prompt>");
         return;
     }

     llama_backend_init();
     llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);

     auto mparams = llama_model_default_params();
     // mparams.n_gpu_layers = 30; // offload layers to the GPU to accelerate inference

     auto ctx_params = llama_context_default_params();
     ctx_params.n_ctx = 2048;

     import std.parallelism;
     ctx_params.n_threads = totalCPUs-1;
     ctx_params.n_threads_batch = ctx_params.n_threads_batch == -1 ? ctx_params.n_threads : ctx_params.n_threads_batch;

     llama_model* model = llama_load_model_from_file(toStringz(args[1]), mparams);
     llama_context* ctx = llama_new_context_with_model(model, ctx_params);

     const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
     const bool allow_special = false;

     string prompt = args[2];

     if (!prompt.length)
         return;

     // convert the prompt to tokens
     llama_token[] embd_inp;
     embd_inp.length = prompt.length;

     writeln("tokenizing...");

     auto n_of_tok = llama_tokenize(llama_get_model(ctx), prompt.ptr, cast(int) prompt.length,
                                    embd_inp.ptr, cast(int) embd_inp.length, add_bos, allow_special);

     if (n_of_tok <= 0) {
         writeln("no tokens generated, something went wrong");
         return;
     }

     embd_inp.length = n_of_tok;

     writeln("input has ", n_of_tok, " tokens");

     foreach (id; embd_inp) {
         write(llama_token_to_piece(ctx, id));
     }
     writeln();

     // total length of the sequence including the prompt
     const int n_len = 128;

     const int n_ctx = llama_n_ctx(ctx);
     const int n_kv_req = cast(int)(embd_inp.length + (n_len - embd_inp.length));

     if (n_kv_req > n_ctx) {
         writeln("error: prompt is too long");
         return;
     }

     writeln("building batch");

     // create a llama_batch with size 512
     // we use this object to submit token data for decoding
     llama_batch batch = llama_batch_init(512, 0, 1);

     // evaluate the initial prompt
     for (size_t i = 0; i < embd_inp.length; i++) {
         // seq_ids = [0] is required: every token needs at least one sequence id
         llama_batch_add(batch, embd_inp[i], cast(int) i, [0], false);
     }

     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;

     writeln("decoding batch");

     if (llama_decode(ctx, batch) != 0) {
         writeln("llama_decode() failed");
         return;
     }

     // main loop

     int n_cur    = batch.n_tokens;
     int n_decode = 0;

     const auto t_main_start = ggml_time_us();

     while (n_cur <= n_len) {
         // sample the next token
         {
             auto n_vocab = llama_n_vocab(model);
             auto logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);

             llama_token_data[] candidates;
             candidates.reserve(n_vocab); // reserve, don't set .length, so ~= below doesn't append after n_vocab empty entries

             for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                 candidates ~= llama_token_data(token_id, logits[token_id], 0.0f);
             }

             llama_token_data_array candidates_p = { candidates.ptr, cast(int) candidates.length, false };

             // sample the most likely token
             const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

             // is it an end of stream?
             if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                 writeln();

                 break;
             }

             writef("%s", llama_token_to_piece(ctx, new_token_id));

             // prepare the next batch
             llama_batch_clear(batch);

             // push this new token for next evaluation
             llama_batch_add(batch, new_token_id, n_cur, [0], true);

             n_decode += 1;
         }

         n_cur += 1;

         // evaluate the current batch with the transformer model
         if (llama_decode(ctx, batch)) {
             writefln("%s : failed to eval, return code %d", __FUNCTION__, 1);
             return;
         }
     }

     const auto t_main_end = ggml_time_us();
     llama_print_timings(ctx);
     writeln();

     // cleanup
     llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();
}


void llama_batch_add(
                     ref llama_batch batch,
                     llama_token id,
                     llama_pos pos,
                     const llama_seq_id[] seq_ids,
                     bool logits) {
     batch.token   [batch.n_tokens] = id;
     batch.pos     [batch.n_tokens] = pos;
     batch.n_seq_id[batch.n_tokens] = cast(int) seq_ids.length;
     for (size_t i = 0; i < seq_ids.length; ++i) {
         batch.seq_id[batch.n_tokens][i] = seq_ids[i];
     }
     batch.logits  [batch.n_tokens] = logits;

     batch.n_tokens++;
}

string llama_token_to_piece(llama_context* ctx, llama_token token) {
     char[] result;
     result.length = 8;
     const int n_tokens = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
     if (n_tokens < 0) {
         result.length = -n_tokens;
         int check = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
         assert(check == -n_tokens);
     } else {
         result.length = n_tokens;
     }

     return cast(string) result;
}

void llama_batch_clear(ref llama_batch batch) {
     batch.n_tokens = 0;
}

```
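
A note on the helpers at the bottom: `llama_batch_add`, `llama_batch_clear` and the `string`-returning `llama_token_to_piece` are hand-ported because their originals live in llama.cpp's C++ `common` code rather than in `llama.h`, so ImportC has nothing to pull in for them. The same wrapping style works for other calls that expect a caller-provided buffer; for example, the tokenize-then-shrink dance in `main` could be folded into a slice-returning helper. A rough sketch (the `tokenize` name is mine, the C calls are exactly the ones already used above; it would sit next to the other helpers in the same file):

```d
// sketch: a slice-returning wrapper around llama_tokenize, in the same style
// as the llama_token_to_piece wrapper above
llama_token[] tokenize(llama_context* ctx, string text, bool add_bos, bool allow_special)
{
    llama_token[] toks;
    toks.length = text.length + (add_bos ? 1 : 0); // rough upper bound: one token per byte, plus BOS
    const n = llama_tokenize(llama_get_model(ctx), text.ptr, cast(int) text.length,
                             toks.ptr, cast(int) toks.length, add_bos, allow_special);
    if (n < 0) {
        // a negative return means the buffer was too small; retry with the exact size
        toks.length = -n;
        llama_tokenize(llama_get_model(ctx), text.ptr, cast(int) text.length,
                       toks.ptr, cast(int) toks.length, add_bos, allow_special);
    } else {
        toks.length = n;
    }
    return toks;
}
```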



Build inside the llama.cpp folder with this command (I've been using CUDA, but it is possible to build without it):

```bat
dmd llama-d.d llamad.c -m64 build/ggml_static.lib build/llama.lib -L/LIBPATH:"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.3/lib/x64" cuda.lib cudart.lib cufft.lib cublas.lib ucrtd.lib -L/NODEFAULTLIB:libucrt.lib -L/NODEFAULTLIB:libcmt.lib msvcprtd.lib
```
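
For a Linux build without CUDA, something along these lines should work; this is an untested sketch that assumes a CMake build in `build/` and the default static archive names, which may differ between llama.cpp versions:

```sh
# untested sketch: Linux build without CUDA, archive names assumed from a CMake build in build/
dmd llama-d.d llamad.c -m64 build/libllama.a build/libggml_static.a -L-lstdc++ -L-lpthread -L-lm
```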



And **run**

```bat
llama-d "E:\ML\pretrained\speechless-llama2-hermes-orca-platypus-wizardlm-13b.Q5_K_M.gguf" "How to quit vim?"
```
Mar 21
Andrea Fontana <nospam example.org> writes:
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
 Just like the title says, no need to make bindings anymore, 
 just make a dummy C file with single include and start building 
 your D app powered with llama.cpp, that's it, no extra work 
 needed except making a dummy file.
Wow! :)
Mar 21
Andrea Fontana <nospam example.org> writes:
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
 Just like the title says, no need to make bindings anymore, 
 just make a dummy C file with single include and start building 
 your D app powered with llama.cpp, that's it, no extra work 
 needed except making a dummy file.

 Tested with dmd v2.107 on Windows.
Tested on linux, it works too!

```
<dummy00022>, why is sky blue?
The sky is blue because of the way that the atmosphere scatters sunlight. Blue light has more energy than the other colors in the visible spectrum, so it's more likely to penetrate the atmosphere and reach our eyes. This causes the blue light to be scattered in all directions, making the sky appear blue. Additionally, the color of the sky can change depending on the weather and the time of day. For example, the sky may appear more orange or red during sunrise or sunset when the sun is low
```

Andrea
Mar 21
Serg Gini <kornburn yandex.ru> writes:
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
 Just like the title says, no need to make bindings anymore, 
 just make a dummy C file with single include and start building 
 your D app powered with llama.cpp, that's it, no extra work 
 needed except making a dummy file.
Nice catch. Thanks for sharing!

Another option is using a pure D implementation based on the llama2.c code from Karpathy: https://github.com/cyrusmsk/llama2.d
Mar 21
Martyn <martyn.developer googlemail.com> writes:
On Thursday, 21 March 2024 at 22:42:31 UTC, Serg Gini wrote:
 On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
 Just like the title says, no need to make bindings anymore, 
 just make a dummy C file with single include and start 
 building your D app powered with llama.cpp, that's it, no 
 extra work needed except making a dummy file.
 Nice catch. Thanks for sharing! Another option is using a pure D implementation based on the llama2.c code from Karpathy: https://github.com/cyrusmsk/llama2.d
This is really awesome!

The only issue I have is with the title. It should have ended with either:

.. and that's a good thing!

or

.. let's get into it.

You are now a blogger, a journalist, a "youtuber" lol
Mar 22
Chris Piker <chris hoopjump.com> writes:
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
 Just like the title says, no need to make bindings anymore, 
 just make a dummy C file with single include and start building 
 your D app powered with llama.cpp, that's it, no extra work 
 needed except making a dummy file.

 Tested with dmd v2.107 on Windows.
Can confirm with the NAIF SPICE library. Tested with dmd v2.103 on Linux. Nice!
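
(The same one-line stub trick should presumably carry over; for CSPICE that might look like the stub below, with the header name assumed and untested here.)

```c
/* spiced.c -- hypothetical ImportC stub for the NAIF CSPICE toolkit */
#include "SpiceUsr.h"
```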
Mar 26