In [1]:
!pip install --q unstructured langchain

In [2]:
!pip install "unstructured[all-docs]"



In [3]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [5]:
from pathlib import Path

# Path to the PDF to index, relative to the notebook's working directory.
local_path = "WEF_The_Global_Cooperation_Barometer_2024.pdf"

# Fail fast with a clear message: every later cell needs `data`, so printing
# a hint and continuing would only defer the failure to a NameError downstream.
if not local_path or not Path(local_path).is_file():
    raise FileNotFoundError(
        f"PDF not found at {local_path!r}. Upload a PDF file or update local_path."
    )

# Local PDF file uploads
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

[nltk_data] Downloading package punkt to /home/uma/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/uma/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [6]:
# Preview the beginning of the extracted text.
# NOTE: with the loader's default settings the whole PDF appears to come back
# as a single Document, so data[0] is not just "page one" -- slice it instead
# of dumping the entire report into the cell output.
PREVIEW_CHARS = 1000
data[0].page_content[:PREVIEW_CHARS]



In [7]:
# Fetch the `nomic-embed-text` model through the Ollama CLI; it is the model
# name passed to OllamaEmbeddings when the vector database is built below.
!ollama pull nomic-embed-text


pulling manifest 
pulling 970aa74c0a90... 100% ▕████████████████▏ 274 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling ce4a164fc046... 100% ▕████████████████▏   17 B                         
pulling 31df23ea7daa... 100% ▕████████████████▏  420 B                         
verifying sha256 digest 
pulling manifest 
pulling 970aa74c0a90... 100% ▕████████████████▏ 274 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling ce4a164fc046... 100% ▕████████████████▏   17 B                         
pulling 31df23ea7daa... 100% ▕████████████████▏  420 B         

In [8]:
# List the models known to the local Ollama installation, to confirm that
# nomic-embed-text is now present.
!ollama list

NAME                   	ID          	SIZE  	MODIFIED       
llama2:latest          	78e26419b446	3.8 GB	4 weeks ago   	
mistral:latest         	61e88e884507	4.1 GB	4 weeks ago   	
nomic-embed-text:latest	0a109f422b47	274 MB	23 seconds ago	


In [9]:
!pip install --q chromadb
!pip install --q langchain-text-splitters

In [10]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [11]:
# Split the loaded document(s) into overlapping chunks ready for embedding.
CHUNK_SIZE = 7500     # value handed to the splitter as chunk_size
CHUNK_OVERLAP = 100   # value handed to the splitter as chunk_overlap

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
chunks = text_splitter.split_documents(data)

In [12]:
# Embed each chunk with the local Ollama embedding model and store the
# resulting vectors in a Chroma collection named "local-rag".
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

vector_db = Chroma.from_documents(
    collection_name="local-rag",
    embedding=embedding_model,
    documents=chunks,
)

OllamaEmbeddings: 100%|█████████████████████████████████████████████████████████| 11/11 [01:31<00:00,  8.35s/it]


In [13]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [14]:
# LLM from Ollama
# `local_model` is the Ollama model name handed to ChatOllama. This notebook
# never pulls it, so it must already be available locally (the `ollama list`
# output above shows `mistral:latest`).
local_model = "mistral"
# Chat model instance reused by both the multi-query retriever and the answer chain.
llm = ChatOllama(model=local_model)

In [15]:
# Instruction text asking the LLM to rephrase the user's question five ways,
# one per line; `{question}` is filled in at query time.
_multi_query_template = """You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}"""

# Prompt passed to MultiQueryRetriever to generate the alternative queries.
QUERY_PROMPT = PromptTemplate(
    template=_multi_query_template,
    input_variables=["question"],
)

In [16]:
# Wrap the plain vector-store retriever in a MultiQueryRetriever, which is
# given the LLM and QUERY_PROMPT so it can search with rephrased questions.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

# RAG prompt: the answer must come only from the retrieved `{context}`.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [17]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            (the documents returned by the retriever).

    Returns:
        The documents' ``page_content`` values separated by blank lines.
        An empty iterable yields an empty string.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: the incoming question is (a) sent to the retriever, whose
# documents are flattened to plain text for `{context}`, and (b) passed through
# unchanged as `{question}`. The filled prompt goes to the LLM and the reply is
# parsed to a plain string.
# Formatting the documents avoids interpolating the raw list-of-Document repr
# (including metadata) into the prompt.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [18]:
# Ask an ad-hoc question interactively. The prompt label tells the reader what
# to type, and blank input is rejected instead of being sent to the chain.
question = input("Question about the PDF: ").strip()
if not question:
    raise ValueError("Please enter a non-empty question.")
chain.invoke(question)

  what is this about?


OllamaEmbeddings: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.07it/s]
OllamaEmbeddings: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.97it/s]
OllamaEmbeddings: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.28it/s]
OllamaEmbeddings: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.64it/s]
OllamaEmbeddings: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.64it/s]


' This passage is from The Global Cooperation Barometer 2024 report, which evaluates the impact of global cooperation in various areas such as health, economy, climate and environment, technology, and peace and security. In this section, the focus is on health cooperation and its trends over the years, with a particular emphasis on how cooperation changed during the COVID-19 pandemic. The passage describes the improvement in health outcomes prior to 2020, the surge in cooperation during the pandemic, and the challenges that remain post-pandemic. It also discusses some key areas where redoubling efforts on global health cooperation is essential, including research collaborations, addressing synthetic drugs, improving mental health support, and engaging aging populations.'

In [19]:
# Run the RAG chain on a fixed question; the returned answer string is the cell's output.
chain.invoke("What are the 5 pillars of global cooperation?")

OllamaEmbeddings: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.60it/s]
OllamaEmbeddings: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.73it/s]
OllamaEmbeddings: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.11it/s]
OllamaEmbeddings: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.78it/s]
OllamaEmbeddings: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.38it/s]


' The five pillars of global cooperation as measured by the Global Cooperation Barometer are trade and capital flows, innovation and technology, climate and natural capital, health and wellness, and peace and security. Each pillar examines evidence of cooperative actions and outcomes of cooperative action to determine an overall level of global cooperation in that area.'