digitalmars.D - This one simple trick allows D programmer use llama.cpp, rust
- evilrat (197/197) Mar 21 Just like the title says, no need to make bindings anymore, just
- Andrea Fontana (2/6) Mar 21 Wow! :)
- Andrea Fontana (14/19) Mar 21 Tested on linux, it works too!
- Serg Gini (5/9) Mar 21 Nice catch. Thanks for sharing!
- Martyn (8/17) Mar 22 This is really awesome!
- Chris Piker (4/9) Mar 26 Can confirm with the NAIF SPICE library. Tested with dmd v2.103
Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file. Tested with dmd v2.107 on Windows.

__llamad.c__:
```c
#include "llama.h"
```

A ported D version of the `simple` example from llama.cpp, __llama-d.d__:
```d
module llama_d;

import std.string;
import std.stdio;

import llamad; // imports llamad.c
// pragma(msg, __traits(allMembers, llamad));

void main(string[] args)
{
    if (args.length < 3)
    {
        writeln("LLAMA D DEMO USAGE: llama-d <path_to_model> <your_prompt>");
        return;
    }

    llama_backend_init();
    llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);

    auto mparams = llama_model_default_params();
    // mparams.n_gpu_layers = 30; // offload layers to the GPU to accelerate inference

    auto ctx_params = llama_context_default_params();
    ctx_params.n_ctx = 2048;

    import std.parallelism;
    ctx_params.n_threads = totalCPUs - 1;
    ctx_params.n_threads_batch = ctx_params.n_threads_batch == -1 ? ctx_params.n_threads : ctx_params.n_threads_batch;

    llama_model* model = llama_load_model_from_file(toStringz(args[1]), mparams);
    llama_context* ctx = llama_new_context_with_model(model, ctx_params);

    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    const bool allow_special = false;

    string prompt = args[2];
    if (!prompt.length)
        return;

    // convert prompt to embeddings (tokens)
    llama_token[] embd_inp;
    embd_inp.length = prompt.length;

    writeln("tokenizing...");

    auto n_of_tok = llama_tokenize(llama_get_model(ctx), prompt.ptr, cast(int) prompt.length,
        embd_inp.ptr, cast(int) embd_inp.length, add_bos, allow_special);
    embd_inp.length = n_of_tok;

    if (!n_of_tok)
    {
        writeln("no tokens generated, something gone wrong");
        return;
    }

    writeln("input has ", n_of_tok, " tokens");
    foreach (id; embd_inp)
    {
        write(llama_token_to_piece(ctx, id));
    }
    writeln();

    // total length of the sequence including the prompt
    const int n_len = 128;

    const int n_ctx = llama_n_ctx(ctx);
    const int n_kv_req = cast(int)(embd_inp.length + (n_len - embd_inp.length));

    if (n_kv_req > n_ctx)
    {
        writeln("error: prompt is too long");
        return;
    }

    writeln("building batch");

    // create a llama_batch with size 512
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < embd_inp.length; i++)
    {
        // note that seq_id = [0] is required as there should be at least one sequence
        llama_batch_add(batch, embd_inp[i], cast(int) i, [0], false);
    }

    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    writeln("decoding batch");

    if (llama_decode(ctx, batch) != 0)
    {
        writeln("llama_decode() failed");
        return;
    }

    // main loop
    int n_cur = batch.n_tokens;
    int n_decode = 0;

    const auto t_main_start = ggml_time_us();

    while (n_cur <= n_len)
    {
        // sample the next token
        {
            auto n_vocab = llama_n_vocab(model);
            auto logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

            llama_token_data[] candidates;
            candidates.reserve(n_vocab);

            for (llama_token token_id = 0; token_id < n_vocab; token_id++)
            {
                candidates ~= llama_token_data(token_id, logits[token_id], 0.0f);
            }

            llama_token_data_array candidates_p = { candidates.ptr, cast(int) candidates.length, false };

            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of stream?
            if (new_token_id == llama_token_eos(model) || n_cur == n_len)
            {
                writeln();
                break;
            }

            writef("%s", llama_token_to_piece(ctx, new_token_id));

            // prepare the next batch
            llama_batch_clear(batch);

            // push this new token for the next evaluation
            llama_batch_add(batch, new_token_id, n_cur, [0], true);

            n_decode += 1;
        }

        n_cur += 1;

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch))
        {
            writefln("%s : failed to eval, return code %d\n", __FUNCTION__, 1);
            return;
        }
    }

    const auto t_main_end = ggml_time_us();

    llama_print_timings(ctx);
    writeln();

    // cleanup
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
}

void llama_batch_add(ref llama_batch batch, llama_token id, llama_pos pos, const llama_seq_id[] seq_ids, bool logits)
{
    batch.token[batch.n_tokens] = id;
    batch.pos[batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = cast(int) seq_ids.length;
    for (size_t i = 0; i < seq_ids.length; ++i)
    {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits[batch.n_tokens] = logits;

    batch.n_tokens++;
}

string llama_token_to_piece(llama_context* ctx, llama_token token)
{
    char[] result;
    result.length = 8;
    const int n_tokens = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
    if (n_tokens < 0)
    {
        // negative return value is the required buffer size; retry with it
        result.length = -n_tokens;
        int check = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
        assert(check == -n_tokens);
    }
    else
    {
        result.length = n_tokens;
    }
    return cast(string) result;
}

void llama_batch_clear(ref llama_batch batch)
{
    batch.n_tokens = 0;
}
```

Build inside the llama.cpp folder with this command (I've been using CUDA, but it is possible to build without it):
```bat
dmd llama-d.d llamad.c -m64 build/ggml_static.lib build/llama.lib -L/LIBPATH:"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.3/lib/x64" cuda.lib cudart.lib cufft.lib cublas.lib ucrtd.lib -L/NODEFAULTLIB:libucrt.lib -L/NODEFAULTLIB:libcmt.lib msvcprtd.lib
```

And **run**:
```bat
llama-d "E:\ML\pretrained\speechless-llama2-hermes-orca-platypus-wizardlm-13b.Q5_K_M.gguf" "How to quit vim?"
```
Mar 21
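For readers wondering what makes the dummy-file trick work: it relies on dmd's ImportC feature, which compiles a .c file passed on the command line and makes its declarations importable from D like a regular module. A minimal sketch of just the mechanism, using a hypothetical mathlib example rather than llama.cpp (the file and function names here are for illustration only):

```c
// mathlib.c - stands in for the dummy file; a real project would just #include a vendor header here
int add(int a, int b) { return a + b; }
```

```d
// app.d - the C translation unit is imported like any D module
import mathlib;   // resolved via ImportC when mathlib.c is passed to dmd
import std.stdio;

void main()
{
    writeln(add(2, 3)); // prints 5
}
```

Built the same way as in the post, by handing both files to the compiler: `dmd app.d mathlib.c`.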
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
> Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file.

Wow! :)
Mar 21
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
> Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file. Tested with dmd v2.107 on Windows.

Tested on Linux, it works too!

```
<dummy00022>, why is sky blue?
The sky is blue because of the way that the atmosphere scatters sunlight. Blue light has more energy than the other colors in the visible spectrum, so it's more likely to penetrate the atmosphere and reach our eyes. This causes the blue light to be scattered in all directions, making the sky appear blue. Additionally, the color of the sky can change depending on the weather and the time of day. For example, the sky may appear more orange or red during sunrise or sunset when the sun is low
```

Andrea
Mar 21
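For anyone wanting to reproduce the Linux result: the post only shows a Windows/CUDA build line, so a rough CPU-only Linux equivalent might look like the sketch below. The static library names and the `build/` layout are assumptions about a cmake build of llama.cpp, not details stated in the thread.

```sh
# hypothetical CPU-only Linux build, assuming llama.cpp was built with cmake into ./build
dmd llama-d.d llamad.c -m64 \
    build/libllama.a build/libggml_static.a \
    -L-lstdc++ -L-lm -L-lpthread
```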
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
> Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file.

Nice catch. Thanks for sharing!

Another option is using a pure D implementation based on the llama2.c code from Karpathy: https://github.com/cyrusmsk/llama2.d
Mar 21
On Thursday, 21 March 2024 at 22:42:31 UTC, Serg Gini wrote:
> On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
>> Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file.
>
> Nice catch. Thanks for sharing! Another option is using a pure D implementation based on the llama2.c code from Karpathy: https://github.com/cyrusmsk/llama2.d

This is really awesome!

Only issue I have with it is the title. It should have ended with either:

- .. and that's a good thing!
- or .. let's get into it.

You are now a blogger, a journalist, a "youtuber" lol
Mar 22
On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
> Just like the title says, no need to make bindings anymore, just make a dummy C file with a single include and start building your D app powered with llama.cpp, that's it, no extra work needed except making the dummy file. Tested with dmd v2.107 on Windows.

Can confirm with the NAIF SPICE library. Tested with dmd v2.103 on Linux.

Nice!
Mar 26
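The same pattern presumably carries over to CSPICE. A hypothetical minimal check is sketched below; the file names and the idea of linking against the toolkit's `cspice.a` are assumptions for illustration, not details from Chris's post.

```c
// spiced.c - dummy translation unit pulling in the CSPICE header
#include "SpiceUsr.h"
```

```d
// spicetest.d - print the toolkit version via the C API imported through ImportC
import spiced;
import std.stdio;
import std.string : fromStringz;

void main()
{
    // tkvrsn_c returns a C string identifying the toolkit release
    writeln("CSPICE: ", fromStringz(tkvrsn_c("TOOLKIT")));
}
```

Built the same way as the llama.cpp example: pass both files plus the toolkit archive to dmd, with the CSPICE include directory visible to the C preprocessor.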