Query bot over multiple JSON files on Langchain


I have around 30 GB of JSON data spread across multiple files and want to build a query bot on top of it. I have already built the same thing on text files, but I am not sure how it would work for JSON data.

I have looked into JSONLoader, but I don't know how to use it to convert the JSON data into vectors and store them in ChromaDB so that I can query them: https://python.langchain.com/docs/modules/data_connection/document_loaders/json

Sample JSON file: http://jsonblob.com/1147948130921996288
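
From the docs, JSONLoader seems to extract content with a jq expression. Below is a minimal single-file sketch of what I think the loading step looks like; the jq_schema selector and the file name are placeholders taken from the docs example, not my actual schema:

from langchain.document_loaders import JSONLoader

# Placeholder selector and path; the real jq_schema depends on
# the structure of my JSON files (requires the jq package).
loader = JSONLoader(
    file_path='/content/drive/MyDrive/Data Science/LLM/docs/json files/sample.json',
    jq_schema='.messages[].content',  # placeholder from the docs example
    text_content=False,               # allow non-string values in the selected field
)
docs = loader.load()
print(len(docs))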

Code for the text data:

# Loading and Splitting the Documents
from langchain.document_loaders import DirectoryLoader

directory = '/content/drive/MyDrive/Data Science/LLM/docs/text files'

def load_docs(directory):
  loader = DirectoryLoader(directory)
  documents = loader.load()
  return documents

documents = load_docs(directory)
len(documents)


from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=1000, chunk_overlap=20):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  docs = text_splitter.split_documents(documents)
  return docs

docs = split_docs(documents)
print(len(docs))

# Embedding Text Using Langchain
from langchain.embeddings import SentenceTransformerEmbeddings
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Creating Vector Store with Chroma DB
from langchain.vectorstores import Chroma
persist_directory = "/content/drive/MyDrive/Data Science/LLM/docs/chroma_db"

vectordb = Chroma.from_documents(
    documents=docs, embedding=embeddings, persist_directory=persist_directory
)

vectordb.persist()

# Using OpenAI Large Language Models (LLM) with Chroma DB
import os
os.environ["OPENAI_API_KEY"] = "sk-your-key"

from langchain.chat_models import ChatOpenAI
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)

# Extracting Answers from Documents
from langchain.chains.question_answering import load_qa_chain
chain = load_qa_chain(llm, chain_type="stuff", verbose=True)

query = "who is Mr. Jabez Wilson?"
matching_docs = vectordb.similarity_search(query)
answer = chain.run(input_documents=matching_docs, question=query)
answer

What I have tried for the JSON data:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import JSONLoader
import json

# Define a simple JSON schema (modify as needed)
json_schema = {}

# Function to validate a JSON document against a schema
def validate_json(json_data, schema):
    return all(key in json_data for key in schema.keys())
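
# Note: with the empty json_schema above, all() over an empty sequence is
# True, so validate_json keeps every document that parses as JSON.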

# 1. Load JSON Files
def load_json_docs(directory):
    # JSONLoader needs a jq_schema (and the jq package installed), so pass
    # it through DirectoryLoader's loader_kwargs; '.' selects each whole
    # file, and text_content=False allows non-string JSON values.
    loader = DirectoryLoader(
        directory,
        glob='**/*.json',
        loader_cls=JSONLoader,
        loader_kwargs={'jq_schema': '.', 'text_content': False},
    )
    documents = loader.load()
    
    # Manually filter and validate documents based on the JSON schema
    valid_documents = []
    for doc in documents:
        try:
            # Parse the JSON content
            json_data = json.loads(doc.page_content)
            if validate_json(json_data, json_schema):
                valid_documents.append(doc)
        except json.JSONDecodeError:
            pass  # Invalid JSON format, skip this document
    
    return valid_documents

directory = '/content/drive/MyDrive/Data Science/LLM/docs/json files'
json_documents = load_json_docs(directory)
len(json_documents)

# 2. Split JSON Documents
def split_json_docs(documents, chunk_size=1000, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs

split_json_documents = split_json_docs(json_documents)
print(len(split_json_documents))

# 3. Embedding Text Using Langchain
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# 4. Creating Vector Store with Chroma DB
persist_directory = "/content/drive/MyDrive/Data Science/LLM/docs/chroma_json_db"

vectordb = Chroma.from_documents(
    documents=split_json_documents, embedding=embeddings, persist_directory=persist_directory
)

vectordb.persist()


# 5. Using OpenAI Large Language Models (LLM) with Chroma DB
import os
os.environ["OPENAI_API_KEY"] = "sk-your-key"

model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)

# 6. Extracting Answers from Documents
chain = load_qa_chain(llm, chain_type="stuff", verbose=True)

query = "who is Mr. Jabez Wilson?"
matching_docs = vectordb.similarity_search(query)
answer = chain.run(input_documents=matching_docs, question=query)
answer
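
One more concern: since the corpus is around 30 GB, I assume a single Chroma.from_documents call over everything will not work well. A sketch of what I was considering instead, embedding and inserting in batches (the batch size is a guess, and this still assumes the split documents fit in memory; only the embedding/insert step is batched):

from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
persist_directory = "/content/drive/MyDrive/Data Science/LLM/docs/chroma_json_db"

# Open (or create) the persisted collection once, then add documents in
# batches so embedding does not process everything at once.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

batch_size = 500  # guessed value; tune to available memory
for i in range(0, len(split_json_documents), batch_size):
    vectordb.add_documents(split_json_documents[i:i + batch_size])

vectordb.persist()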