%%bash
# Install the Python packages this notebook imports (-q keeps pip output quiet).
# LlamaIndex: https://github.com/run-llama/llama_index
pip install -q llama-index
# Sentence-Transformers (embedding models): https://github.com/UKPLab/sentence-transformers
pip install -q sentence-transformers


# Install llama-cpp-python; CMAKE_ARGS passes the LLAMA_CUBLAS CMake option to its build.
# GPU variant (LLAMA_CUBLAS=on): uncomment this line and comment out the CPU one to use it.
#!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
# CPU variant (LLAMA_CUBLAS=off): the one actually executed by this cell.
!CMAKE_ARGS="-DLLAMA_CUBLAS=off" pip install llama-cpp-python


# Download URL of the GGUF file for the LLM (Vigogne-2-7B-Chat, Q5_K_M file).
# Model card: https://huggingface.co/TheBloke/Vigogne-2-7B-Chat-GGUF
LLM_model_name = (
    "https://huggingface.co/TheBloke/Vigogne-2-7B-Chat-GGUF"
    "/resolve/main/vigogne-2-7b-chat.Q5_K_M.gguf"
)

# Name or path of the sentence-transformers embedding model. Alternatives:
#  - Multilingual: paraphrase-multilingual-mpnet-base-v2,
#                  paraphrase-multilingual-MiniLM-L12-v2
#  - French: dangvantuan/sentence-camembert-base,
#            dangvantuan/sentence-camembert-large
embedding_model_name = "dangvantuan/sentence-camembert-base"


from llama_index import SimpleDirectoryReader

print("Chargement des documents 'Info1.txt' & 'Info2.txt'")

# The two text files (stored on the mounted Google Drive) to load.
input_paths = [
    '/content/drive/MyDrive/Colab Notebooks/LlamaIndex-Vigogne QA/Data/Info1.txt',
    '/content/drive/MyDrive/Colab Notebooks/LlamaIndex-Vigogne QA/Data/Info2.txt',
]

# Read the listed files; the result is consumed later when building the index.
reader = SimpleDirectoryReader(input_files=input_paths)
documents = reader.load_data()

2023-11-06 14:25:17 INFO     Chargement des documents 'Info1.txt' & 'Info2.txt'


from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding

print(f"Chargement du modéle Embedding: {embedding_model_name} ...")

# Build the LangChain Hugging Face embedder first (embeddings are left
# un-normalized), then wrap it so LlamaIndex can use it as its embed model.
hf_embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    encode_kwargs={"normalize_embeddings": False},
)
embedding_model = LangchainEmbedding(hf_embeddings)


from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

print(f"Chargement du modéle LLM: {LLM_model_name} ...")

# Options forwarded to the underlying model through `model_kwargs`.
# NOTE(review): `low_cpu_mem_usage` looks like a Hugging Face transformers
# option; confirm that llama-cpp-python actually honours it.
llama_cpp_model_kwargs = {
    "low_cpu_mem_usage": True,
}

# NOTE(review): the prompt helpers come from `llama_utils`; confirm that the
# prompt format they produce suits the Vigogne chat model.
llm_options = dict(
    # A URL to a GGML/GGUF model makes LlamaCPP download it automatically.
    model_url=LLM_model_name,
    # Alternatively, a path to a pre-downloaded model could be given here
    # instead of `model_url`.
    model_path=None,
    temperature=0.1,
    max_new_tokens=1024,
    generate_kwargs={},
    model_kwargs=llama_cpp_model_kwargs,
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

llm = LlamaCPP(**llm_options)

2023-11-06 14:20:41 INFO     Chargement du modéle LLM: https://huggingface.co/TheBloke/Vigogne-2-7B-Chat-GGUF/resolve/main/vigogne-2-7b-chat.Q5_K_M.gguf ...

Downloading url https://huggingface.co/TheBloke/Vigogne-2-7B-Chat-GGUF/resolve/main/vigogne-2-7b-chat.Q5_K_M.gguf to path /tmp/llama_index/models/vigogne-2-7b-chat.Q5_K_M.gguf
total size (MB): 4783.16

4562it [02:03, 36.91it/s]
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 |


from llama_index import ServiceContext, VectorStoreIndex

print("Storing chunks...")

# Indexing only needs the embedding model, so no LLM is supplied here
# (llm=None); documents are split using chunk_size=500.
indexing_context = ServiceContext.from_defaults(
    llm=None,
    embed_model=embedding_model,
    chunk_size=500,
)

# https://docs.llamaindex.ai/en/stable/api_reference/indices/vector_store.html
vectorstore_index = VectorStoreIndex.from_documents(
    documents=documents,
    service_context=indexing_context,
    show_progress=True,
)

2023-11-06 14:29:07 INFO     Storing chunks...
[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.

LLM is explicitly disabled. Using MockLLM.

Parsing documents into nodes:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


from llama_index import ServiceContext, VectorStoreIndex

print("Storing chunks...")

# NOTE(review): this cell builds the index again with the same settings as the
# previous cell and additionally persists it; confirm whether both are needed.
embedding_only_context = ServiceContext.from_defaults(
    llm=None,
    embed_model=embedding_model,
    chunk_size=500,
)

# https://docs.llamaindex.ai/en/stable/api_reference/indices/vector_store.html
vectorstore_index = VectorStoreIndex.from_documents(
    documents=documents,
    service_context=embedding_only_context,
    show_progress=True,
)

# Write the index storage to the 'llama_index' directory.
index_storage = vectorstore_index.storage_context
index_storage.persist(persist_dir='llama_index')


from llama_index.prompts import PromptTemplate
from llama_index import ServiceContext

# Question-answering prompt (in French): a system instruction, then the
# retrieved context and the user question, then the assistant tag.
# `{context_str}` and `{query_str}` are the template placeholders.
text_qa_template_str = (
  "<|system|>: Vous êtes un assistant IA qui répond à la question posée à la fin en utilisant le contexte suivant. Si vous ne connaissez pas la réponse, dites simplement que vous ne savez pas, n'essayez pas d'inventer une réponse. Veuillez répondre exclusivement en français.\n"
  "<|user|>: {context_str}\n"
  "Question: {query_str}\n"
  "<|assistant|>:"
)
text_qa_template = PromptTemplate(text_qa_template_str)

# Unlike at indexing time, the query-time service context carries the LLM.
answering_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embedding_model,
    chunk_size=500,
)

query_engine = vectorstore_index.as_query_engine(
    text_qa_template=text_qa_template,
    service_context=answering_context,
)


from IPython.display import Markdown

# Bind the question once so the echoed text always matches the query sent.
question = "Qui est l’auteur de TutoPlot ? Quelle est sa couleur préférée ?"

response = query_engine.query(question)
print(f"Question: {question}")
# Render the answer in italics as Markdown in the notebook output.
display(Markdown(f"Reponse: <i>{response}</i>"))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Qui est l’auteur de TutoPlot ? Quelle est sa couleur préférée ?


# Bind the question once so the echoed text always matches the query sent.
question = "Pourquoi JupyterGoCrash est-il obsolète ?"

response = query_engine.query(question)
print(f"Question: {question}")
# Render the answer in italics as Markdown in the notebook output.
display(Markdown(f"Reponse: <i>{response}</i>"))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Llama.generate: prefix-match hit

Question: Pourquoi JupyterGoCrash est-il obsolète ?