The code follows the official tutorial exactly: https://colab.research.google.com/drive/1tLjOg2ZQuIClfuWrAC2LdiZHCov8oUbs?usp=sharing#scrollTo=ZTTNlWcYSDRl
# Read credentials
import subprocess
import os
# Define the command to source the openrc file and print environment variables
source_command = 'bash -c ". openrc && env"'
# Run the command and capture its output
completed_process = subprocess.run(source_command, shell=True, stdout=subprocess.PIPE, text=True)
# Parse the output to extract environment variables
env_output = completed_process.stdout
env_lines = env_output.splitlines()
env_variables = {}
for line in env_lines:
    key, value = line.split('=', 1)
    if any([
        "OPENAI" in key,
        "NEBULA" in key,
        "GRAPH" in key,
    ]):
        env_variables[key] = value
os.environ.update(env_variables)
# For Azure OpenAI
import os
import json
import openai
from llama_index.llms import AzureOpenAI
from langchain.embeddings import OpenAIEmbeddings
from llama_index import LangchainEmbedding
from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    KnowledgeGraphIndex,
    ServiceContext,
)
from llama_index import set_global_service_context
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
from llama_index.llms import LangChainLLM
from llama_index.core.node_parser.file.simple_file import SimpleFileNodeParser
import logging
import sys
from IPython.display import Markdown, display
logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
openai.api_type = "azure"
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_version = "2023-03-15-preview"
os.environ["OPENAI_API_KEY"] = "INSERT OPENAI KEY"
openai.api_key = os.getenv("OPENAI_API_KEY")
# Instantiate the SimpleFileNodeParser
node_parser = SimpleFileNodeParser.from_defaults()
lc_llm = AzureOpenAI(
    engine=os.environ["OPENAI_ENGINE"],
    temperature=0,
    model="gpt-3.5-turbo",
)
# You need to deploy your own embedding model as well as your own chat completion model
embedding_llm = LangchainEmbedding(
    OpenAIEmbeddings(
        model="text-embedding-ada-002",
        deployment=os.environ["OPENAI_EMBEDDING_ENGINE"],
        openai_api_key=openai.api_key,
        openai_api_base=openai.api_base,
        openai_api_type=openai.api_type,
        openai_api_version=openai.api_version,
    ),
    embed_batch_size=1,
)
service_context = ServiceContext.from_defaults(
    llm=lc_llm,
    embed_model=embedding_llm,
)
# SET Global Service Context
set_global_service_context(service_context)
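# Connect to NebulaGraph through the ipython-ngql magic, then create the graph space and schema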
%load_ext ngql
connection_string = f"--address {os.environ['GRAPHD_HOST']} --port 9669 --user root --password {os.environ['NEBULA_PASSWORD']}"
%ngql {connection_string}
# %ngql USE cy_test2
%ngql CREATE SPACE IF NOT EXISTS rag_workshop(vid_type=FIXED_STRING(256), partition_num=1, replica_factor=1);
%%ngql
USE rag_workshop;
CREATE TAG IF NOT EXISTS entity(name string);
CREATE EDGE IF NOT EXISTS relationship(relationship string);
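# Load the Wikipedia article used to build the knowledge graph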
from llama_index import download_loader
WikipediaReader = download_loader("WikipediaReader")
loader = WikipediaReader()
documents = loader.load_data(pages=['Guardians of the Galaxy Vol. 3'], auto_suggest=False)
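# Connection settings read by NebulaGraphStore (already exported from openrc above)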
os.environ['NEBULA_USER'] = os.environ["NEBULA_USER"]
os.environ['NEBULA_PASSWORD'] = os.environ["NEBULA_PASSWORD"]
os.environ['NEBULA_ADDRESS'] = os.environ["NEBULA_ADDRESS"]
space_name = "rag_workshop"
edge_types, rel_prop_names = ["relationship"], ["relationship"]
tags = ["entity"]
graph_store = NebulaGraphStore(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)
storage_context = StorageContext.from_defaults(graph_store=graph_store)
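# Tokenizer data used by NLTK-based sentence splitting when parsing documents into nodes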
import nltk
nltk.download('punkt_tab')
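# Build the knowledge graph index: triplets are extracted by the LLM and stored (with embeddings) in NebulaGraph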
kg_index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
    max_triplets_per_chunk=3,
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
    include_embeddings=True,
)
Execution reaches this point and then fails with the following error:
ValidationError                           Traceback (most recent call last)
Cell In[8], line 4
      1 import nltk
      2 nltk.download('punkt_tab')
----> 4 kg_index = KnowledgeGraphIndex.from_documents(
      5     documents,
      6     storage_context=storage_context,
      7     service_context=service_context,
      8     max_triplets_per_chunk=3,
      9     space_name=space_name,
     10     edge_types=edge_types,
     11     rel_prop_names=rel_prop_names,
     12     tags=tags,
     13     include_embeddings=True,
     14 )

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/llama_index/indices/base.py:102, in BaseIndex.from_documents(cls, documents, storage_context, service_context, show_progress, **kwargs)
     97     docstore.set_document_hash(doc.get_doc_id(), doc.hash)
     98 nodes = service_context.node_parser.get_nodes_from_documents(
     99     documents, show_progress=show_progress
    100 )
--> 102 return cls(
    103     nodes=nodes,
    104     storage_context=storage_context,
    105     service_context=service_context,
    106     show_progress=show_progress,
    107     **kwargs,
    108 )

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/llama_index/indices/knowledge_graph/base.py:85, in KnowledgeGraphIndex.__init__(self, nodes, index_struct, service_context, storage_context, kg_triple_extract_template, max_triplets_per_chunk, include_embeddings, show_progress, max_object_length, kg_triplet_extract_fn, **kwargs)
     82 self._max_object_length = max_object_length
     83 self._kg_triplet_extract_fn = kg_triplet_extract_fn
---> 85 super().__init__(
     86     nodes=nodes,
     87     index_struct=index_struct,
     88     service_context=service_context,
     89     storage_context=storage_context,
     90     show_progress=show_progress,
     91     **kwargs,
     92 )

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/llama_index/indices/base.py:71, in BaseIndex.__init__(self, nodes, index_struct, storage_context, service_context, show_progress, **kwargs)
     69 if index_struct is None:
     70     assert nodes is not None
---> 71     index_struct = self.build_index_from_nodes(nodes)

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/llama_index/indices/base.py:171, in BaseIndex.build_index_from_nodes(self, nodes)
    169 """Build the index from nodes."""
    170 self._docstore.add_documents(nodes, allow_update=True)
--> 171 return self._build_index_from_nodes(nodes)

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/llama_index/indices/knowledge_graph/base.py:171, in KnowledgeGraphIndex._build_index_from_nodes(self, nodes)
    170 for n in nodes_with_progress:
--> 171     triplets = self._extract_triplets(
    172         n.get_content(metadata_mode=MetadataMode.LLM)
    173     )

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/llama_index/indices/knowledge_graph/base.py:122, in KnowledgeGraphIndex._extract_triplets(self, text)
    120     return self._kg_triplet_extract_fn(text)
    121 else:
--> 122     return self._llm_extract_triplets(text)

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/llama_index/indices/knowledge_graph/base.py:126, in KnowledgeGraphIndex._llm_extract_triplets(self, text)
    124 def _llm_extract_triplets(self, text: str) -> List[Tuple[str, str, str]]:
    125     """Extract keywords from text."""
--> 126     response = self._service_context.llm_predictor.predict(
    127         self.kg_triple_extract_template,
    128         text=text,
    129     )

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/llama_index/llm_predictor/base.py:115, in LLMPredictor.predict(self, prompt, **prompt_args)
    113 messages = prompt.format_messages(llm=self._llm, **prompt_args)
    114 messages = self._extend_messages(messages)
--> 115 chat_response = self._llm.chat(messages)

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/llama_index/core/instrumentation/dispatcher.py:321, in Dispatcher.span.<locals>.wrapper(func, instance, args, kwargs)
    320 try:
--> 321     result = func(*args, **kwargs)

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/llama_index/core/llms/callbacks.py:157, in llm_chat_callback.<locals>.wrap.<locals>.wrapped_llm_chat(_self, messages, **kwargs)
    154 model_dict = _self.to_dict()
    155 model_dict.pop("api_key", None)
    156 dispatcher.event(
--> 157     LLMChatStartEvent(
    158         model_dict=model_dict,
    159         messages=messages,
    160         additional_kwargs=kwargs,
    161         span_id=span_id,
    162     )
    163 )

File ~/.conda/envs/pythonProject/lib/python3.10/site-packages/pydantic/main.py:214, in BaseModel.__init__(self, **data)
    213 __tracebackhide__ = True
--> 214 validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)

ValidationError: 1 validation error for LLMChatStartEvent
messages.0
  Input should be a valid dictionary or instance of ChatMessage [type=model_type, input_value=ChatMessage(role=<Message...", additional_kwargs={}), input_type=ChatMessage]