Source code for xrag.index.index

from llama_index.core import VectorStoreIndex
from llama_index.core import (
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore

from ..data.qa_loader import get_documents
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser
from llama_index.core.node_parser import HierarchicalNodeParser

[docs] def get_index(documents, persist_dir, split_type="sentence", chunk_size=1024): hierarchical_storage_context = None if not os.path.exists(persist_dir): # load the documents and create the index if split_type == "sentence": parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=20) nodes = parser.get_nodes_from_documents(documents, show_progress=True) print("nodes: " + str(nodes.__len__())) index = VectorStoreIndex(nodes,show_progress=True) elif split_type == "character": parser = LangchainNodeParser(RecursiveCharacterTextSplitter()) nodes = parser.get_nodes_from_documents(documents, show_progress=True) print("nodes: " + str(nodes.__len__())) index = VectorStoreIndex(nodes,show_progress=True) elif split_type == "hierarchical": parser = HierarchicalNodeParser.from_defaults( chunk_sizes=[2048, 512, 128] ) nodes = parser.get_nodes_from_documents(documents, show_progress=True) print("nodes: " + str(nodes.__len__())) index = VectorStoreIndex(nodes,show_progress=True) else: raise ValueError(f"split_type {split_type} not supported.") # store it for later if split_type == "hierarchical": docstore = SimpleDocumentStore() docstore.add_documents(nodes) hierarchical_storage_context = StorageContext.from_defaults(docstore=docstore) # save hierarchical_storage_context.persist(persist_dir=persist_dir+"-hierarchical") index.storage_context.persist(persist_dir=persist_dir) else: # load the existing index if split_type == "hierarchical": hierarchical_storage_context = StorageContext.from_defaults(persist_dir=persist_dir + "-hierarchical") storage_context = StorageContext.from_defaults(persist_dir=persist_dir) index = load_index_from_storage(storage_context) return index, hierarchical_storage_context