SWIG bindings for llama.cpp: Java/JVM, C#/CLR, etc.
llama.cpp is a popular, simplified implementation of LLaMA-family models in pure C/C++. It already has ctypes-based Python bindings, as well as Go and Node.js ones.
This is just a quick note that I've implemented a basic SWIG-based wrapper which, among other things, can build a Java/JVM binding (already implemented), a C#/CLR binding, and bindings for many other languages. The binding is available at: https://github.com/bnikolic/llama.swig
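For orientation, the core of a SWIG wrapper is the interface file. The sketch below is a minimal illustration using standard SWIG directives of how such an interface could be put together; it is not the actual interface in llama.swig. In particular, I am assuming the floatArray and vectorll classes used in the example come from SWIG's carrays.i and std_vector.i library files (the data() and emplace_back() methods used below suggest the real interface additionally extends the vector proxy):

// llama.i -- a minimal, illustrative SWIG interface (not the real one)
%module LLamaWrap
%{
#include "llama.h"
%}

// Expose raw C float* buffers (e.g. the logits) through an indexable
// proxy class named floatArray
%include "carrays.i"
%array_class(float, floatArray);

%include "std_vector.i"

// Wrap everything declared in the llama.cpp API header
%include "llama.h"

// Expose std::vector<llama_token_data> as the vectorll proxy class
%template(vectorll) std::vector<llama_token_data>;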
A basic example demonstrating the wrapper:
/* This follows the llama.cpp example "simple.cpp" */
package examples;

import uk.co.bnikolic.LLamaWrap;
import uk.co.bnikolic.gpt_params;
import uk.co.bnikolic.vectorll;
import uk.co.bnikolic.llama_token_data;
import uk.co.bnikolic.llama_token_data_array;
import uk.co.bnikolic.floatArray;
public class Simple {

    public static void main(String[] args) throws Exception {
        System.out.println(LLamaWrap.llama_print_system_info());

        gpt_params params = new gpt_params();
        params.setPrompt("Hello my name is");
        params.setModel("models/ggml-vicuna-7b-1.1-q4_0.bin");

        LLamaWrap.llama_init_backend();
        var ctx = LLamaWrap.llama_init_from_gpt_params(params);

        // Tokenize the prompt (the final argument requests a leading BOS token)
        var tokens_list = LLamaWrap.llama_tokenize(ctx, params.getPrompt(), true);

        int max_context_size = LLamaWrap.llama_n_ctx(ctx);
        int max_tokens_list_size = max_context_size - 4;
        // As in simple.cpp, make sure the prompt fits into the context window
        if (tokens_list.size() > max_tokens_list_size) {
            System.out.println("error: prompt too long");
            return;
        }

        // Echo the prompt token by token
        for (var id : tokens_list) {
            System.out.print(LLamaWrap.llama_token_to_str(ctx, id));
        }
        System.out.println();
        // Main loop: keep predicting tokens until the context window is full
        while (LLamaWrap.llama_get_kv_cache_token_count(ctx) < max_context_size) {

            //---------------------------------
            // Evaluate the tokens :
            //---------------------------------
            if (LLamaWrap.llama_eval(ctx,
                                     tokens_list.data(),
                                     tokens_list.size(),
                                     LLamaWrap.llama_get_kv_cache_token_count(ctx),
                                     params.getN_threads()) != 0) {
                System.out.println("llama_eval failed");
                return;
            }
            tokens_list.clear();

            //---------------------------------
            // Select the best prediction :
            //---------------------------------
            // View the raw C float* of logits through the SWIG-generated
            // floatArray proxy
            var logits = floatArray.frompointer(LLamaWrap.llama_get_logits(ctx));
            var n_vocab = LLamaWrap.llama_n_vocab(ctx); // the size of the LLM vocabulary (in tokens)

            // Build a candidate entry for every token in the vocabulary
            vectorll candidates = new vectorll();
            candidates.reserve(n_vocab);
            for (int token_id = 0; token_id < n_vocab; token_id++) {
                llama_token_data d = new llama_token_data();
                d.setId(token_id);
                d.setLogit(logits.getitem(token_id));
                d.setP(0.0f);
                candidates.emplace_back(d);
            }
            llama_token_data_array candidates_p = new llama_token_data_array();
            candidates_p.setData(candidates.data());
            candidates_p.setSize(candidates.size());
            candidates_p.setSorted(false);

            // Select the next token using the "greedy sampling" method :
            int new_token_id = LLamaWrap.llama_sample_token_greedy(ctx, candidates_p);

            // Is it an end-of-stream token? If so, stop generating
            if (new_token_id == LLamaWrap.llama_token_eos()) {
                System.out.println("end");
                break;
            }

            // Print the new token :
            System.out.print(LLamaWrap.llama_token_to_str(ctx, new_token_id));

            // Push this new token for the next evaluation :
            tokens_list.push_back(new_token_id);
        } // end of main loop
    }
}
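To compile and run the example, something along these lines should work (assuming the SWIG-generated classes are packaged as LLamaWrap.jar and the native shared library sits in the current directory):

javac -cp LLamaWrap.jar examples/Simple.java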
LD_LIBRARY_PATH=. java -cp "LLamaWrap.jar:." examples.Simple
The first line of output is llama.cpp's system information:
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |