I'm looking for a way to query Azure AI Search with vector indexing from a Confluence Data Center server, using the Python code below.
I'm getting this error:
ERROR - Failed to upload documents to Azure Search: 'dict' object has no attribute 'name'
The code fails on vector_store.add_documents(split_docs). I reviewed https://python.langchain.com/docs/concepts/vectorstores/ and https://python.langchain.com/docs/concepts/text_splitters/.
I understand I need to convert the split_docs structure to a list. Is that the right approach?
The data structure is as follows:
First document type: <class 'langchain_core.documents.base.Document'>
First document content: page_content='Be aware: Described steps require manual actions. Already automated actions are removed from below guide. It is also possible that some version skips "Y" part, so then any Y-1 version is X-1.Preparing new Release Candidate (RC,' metadata={'title': 'Managing EBF and RC branches during release - rules and steps for 10.5 line', 'id': '294514485', 'source': 'https://confluencewiki.XXX/display/ABC/Managing+EBF+and+RC+branches+during+release+-+rules+and+steps+for+10.5+line', 'when': '2024-08-12T05:06:02.270-07:00'}
import os
import logging
from langchain_community.document_loaders import ConfluenceLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import AzureSearch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from langchain_core.documents import Document

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("confluence_rag.log"),  # Log to a file
        logging.StreamHandler()  # Also log to console
    ]
)
logger = logging.getLogger(__name__)

# Configuration
CONFLUENCE_URL = "https://confluencewiki.XXX/"  # Base URL of your Confluence Data Center
CONFLUENCE_USERNAME = "XXZ"
CONFLUENCE_TOKEN = "X+X"  # Use password or personal access token
CONFLUENCE_SPACE_KEY = "SCAN"  # Optional: Specify a space key to limit the scope
AZURE_SEARCH_ENDPOINT = "https://XXX.search.windows.net"
AZURE_SEARCH_INDEX_NAME = "confluence-index"
AZURE_SEARCH_API_KEY = "XXX"
AZURE_OPENAI_ENDPOINT = "https://XXX.openai.azure.com/"
AZURE_OPENAI_API_KEY = "XXX"
AZURE_OPENAI_DEPLOYMENT_NAME = "embeddingmodel"
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = "embeddingmodel"  # Embedding deployment name

# Initialize Azure OpenAI Embeddings
logger.info(f"Initializing Azure OpenAI Embeddings with deployment: {AZURE_OPENAI_EMBEDDING_DEPLOYMENT}")
try:
    embeddings = AzureOpenAIEmbeddings(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
        api_version="2023-05-15",
    )
    test_embedding = embeddings.embed_query("Test sentence")
    logger.info(f"Embedding test successful: {len(test_embedding)} dimensions")
except Exception as e:
    logger.error(f"Failed to initialize Azure OpenAI Embeddings: {e}")
    raise

# Initialize Azure OpenAI Chat Model
logger.info(f"Initializing Azure OpenAI Chat Model with deployment: {AZURE_OPENAI_DEPLOYMENT_NAME}")
try:
    llm = AzureChatOpenAI(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        deployment_name=AZURE_OPENAI_DEPLOYMENT_NAME,
        api_version="2023-05-15",
        temperature=0.7,
    )
except Exception as e:
    logger.error(f"Failed to initialize Azure OpenAI Chat Model: {e}")
    raise

# Step 1: Load Confluence Data
logger.info(f"Loading Confluence data from space: {CONFLUENCE_SPACE_KEY}")
loader = ConfluenceLoader(
    url=CONFLUENCE_URL,
    token=CONFLUENCE_TOKEN,
    cloud=False,
    space_key=CONFLUENCE_SPACE_KEY,
    limit=100,
)
try:
    documents = loader.load()
    logger.info(f"Loaded {len(documents)} documents from Confluence")
except Exception as e:
    logger.error(f"Failed to load Confluence data: {e}")
    raise

# Step 2: Split Documents
logger.info("Splitting documents into chunks")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = text_splitter.split_documents(documents)
logger.info(f"Split into {len(split_docs)} chunks")
logger.info(f"First split_doc: {split_docs[0].page_content}, Metadata: {split_docs[0].metadata}")

# Test Azure Search Connection
logger.info("Testing Azure Search connection")
try:
    credential = AzureKeyCredential(AZURE_SEARCH_API_KEY)
    index_client = SearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=credential)
    index_client.get_service_statistics()
    logger.info("Azure Search connection successful")
except Exception as e:
    logger.error(f"Azure Search connection failed: {e}")
    raise

# Step 3: Initialize Azure Search Vector Store
logger.info(f"Initializing Azure Search Vector Store with index: {AZURE_SEARCH_INDEX_NAME}")
try:
    vector_store = AzureSearch(
        azure_search_endpoint=AZURE_SEARCH_ENDPOINT,
        azure_search_key=AZURE_SEARCH_API_KEY,
        index_name=AZURE_SEARCH_INDEX_NAME,
        embedding_function=embeddings.embed_query,
        fields=[
            {"name": "id", "type": "Edm.String", "key": True},
            {"name": "title", "type": "Edm.String", "searchable": True, "filterable": True},
            {"name": "content", "type": "Edm.String", "searchable": True},
            {
                "name": "embedding",
                "type": "Collection(Edm.Single)",
                "searchable": True,
                "vectorSearchDimensions": 1536,
                "vectorSearchConfiguration": "my-vector-profile",
            },
        ],
        vector_search_configuration={
            "vectorSearches": [
                {
                    "profiles": [
                        {"name": "my-vector-profile", "algorithmConfigurationName": "my-hnsw-config"}
                    ],
                    "algorithms": [
                        {"name": "my-hnsw-config", "kind": "hnsw"}
                    ],
                }
            ]
        }
    )
except Exception as e:
    logger.error(f"Failed to initialize Azure Search Vector Store: {e}")
    logger.debug(f"Endpoint: {AZURE_SEARCH_ENDPOINT}, Index: {AZURE_SEARCH_INDEX_NAME}")
    raise

# Step 4: Add Documents to Azure Search
logger.info("Uploading documents to Azure Search")
logger.info(f"First document type: {type(split_docs[0])}")
logger.info(f"First document content: {split_docs[0]}")
logger.info(f"First loaded document type: {type(documents[0])}")
logger.info(f"First loaded document content: {documents[0]}")
try:
    vector_store.add_documents(split_docs)
    logger.info(f"Uploaded {len(split_docs)} documents to Azure Search")
except Exception as e:
    logger.error(f"Failed to upload documents to Azure Search: {e}")
    raise

# Step 5: Set up RetrievalQA Chain
logger.info("Setting up RetrievalQA chain")
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

# Step 6: Query the System
def query_confluence(user_query):
    logger.info(f"Querying Confluence with: {user_query}")
    try:
        result = qa_chain({"query": user_query})
        logger.info("Query completed successfully")
        return result["result"], result["source_documents"]
    except Exception as e:
        logger.error(f"Query failed: {e}")
        raise

# Example Usage
if __name__ == "__main__":
    user_query = "Please point out the most important data from IT space?"
    logger.debug(f"Starting main execution with query: {user_query}")
    answer, sources = query_confluence(user_query)
    logger.info("Answer generated:")
    print("Answer:", answer)
    print("\nSources:")
    for doc in sources:
        logger.info(f"Source: {doc.metadata['title']} (ID: {doc.metadata['id']})")
        print(f"- {doc.metadata['title']} (ID: {doc.metadata['id']})")
- Ensure split_docs is a list of dictionaries with correctly formatted attributes before calling vector_store.add_documents(). – Aslesha Kantamsetti, Mar 18 at 15:09
1 Answer
ERROR - Failed to upload documents to Azure Search: 'dict' object has no attribute 'name'
split_docs is a list of Document objects from langchain_core.documents, but AzureSearch.add_documents() here expects a list of dictionaries, each representing a document. Every document added to Azure Search must match the field names and structure defined in the index schema; please refer to the Microsoft documentation on indexes in Azure AI Search for details.
To avoid the error, convert split_docs to a list of dictionaries: before passing it to add_documents(), transform each Document into a dictionary matching the Azure Search schema.
Modify the code before vector_store.add_documents(split_docs) as below:
# Format for Azure Search
formatted_docs = [
    {
        "id": doc.metadata.get("id", str(index)),
        "title": doc.metadata.get("title", "Untitled"),
        "content": doc.page_content,
        "embedding": embeddings.embed_query(doc.page_content),
    }
    for index, doc in enumerate(split_docs)
]

try:
    vector_store.add_documents(formatted_docs)
    logger.info(f"Uploaded {len(formatted_docs)} documents to Azure Search")
except Exception as e:
    logger.error(f"Failed to upload documents to Azure Search: {e}")
    raise
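
To sanity-check the upload afterwards, you can run a quick similarity search against the index. This is a minimal sketch of my own, not part of the fix itself; the query string is illustrative and it assumes the vector_store and successful upload from the code above.

# Hedged sanity check: query the freshly populated index and print the top matches
results = vector_store.similarity_search("Release Candidate branches", k=3)
for doc in results:
    print(doc.page_content[:100])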
Complete code:
import logging
from langchain_community.vectorstores import AzureSearch
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from azure.core.exceptions import AzureError

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

AZURE_SEARCH_ENDPOINT = "https://your-search-service.search.windows.net"
AZURE_SEARCH_API_KEY = "your-api-key"
AZURE_SEARCH_INDEX_NAME = "your-index-name"
OPENAI_API_KEY = "your-openai-api-key"
OPENAI_EMBEDDING_MODEL = "text-embedding-ada-002"

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=OPENAI_EMBEDDING_MODEL)

vector_store = AzureSearch(
    azure_search_endpoint=AZURE_SEARCH_ENDPOINT,
    azure_search_key=AZURE_SEARCH_API_KEY,
    index_name=AZURE_SEARCH_INDEX_NAME,
    embedding_function=embeddings.embed_query,
    fields=[
        {"name": "id", "type": "Edm.String", "key": True},
        {"name": "title", "type": "Edm.String", "searchable": True, "filterable": True},
        {"name": "content", "type": "Edm.String", "searchable": True},
        {
            "name": "embedding",
            "type": "Collection(Edm.Single)",
            "searchable": True,
            "vectorSearchDimensions": 1536,
            "vectorSearchConfiguration": "my-vector-profile",
        },
    ]
)

# Sample documents standing in for the Confluence chunks
split_docs = [
    Document(page_content="This is the first document.", metadata={"id": "doc1", "title": "First Document"}),
    Document(page_content="This is the second document.", metadata={"id": "doc2", "title": "Second Document"})
]

# Convert each Document into a dictionary matching the index schema
formatted_docs = [
    {
        "id": doc.metadata.get("id", str(index)),
        "title": doc.metadata.get("title", "Untitled"),
        "content": doc.page_content,
        "embedding": embeddings.embed_query(doc.page_content),
    }
    for index, doc in enumerate(split_docs)
]

try:
    vector_store.add_documents(formatted_docs)
    logger.info(f"Uploaded {len(formatted_docs)} documents to Azure Search")
except AzureError as e:
    logger.error(f"Azure Search Error: {e}")
except Exception as e:
    logger.error(f"Failed to upload documents to Azure Search: {e}")
    raise
- Also ensure the index fields match the structure of the documents you are uploading (one way to keep them in sync is sketched below).
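
As a hedged aside on that last point: LangChain's AzureSearch hands the fields list to the azure-search-documents SDK, which reads a .name attribute from each field object, so defining the schema with the SDK's field model classes rather than plain dicts keeps the two in sync. The sketch below is an assumption on my part, not part of the answer above; it presumes azure-search-documents 11.4 or later, and the profile name simply mirrors the one used in the question.

# Minimal sketch (assumes azure-search-documents >= 11.4): the same schema
# expressed with the SDK's field model classes, which expose the .name
# attribute the library reads when building the index.
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    SearchableField,
    SimpleField,
)

fields = [
    # Key field: every Azure AI Search index needs exactly one key field
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="title", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="content", type=SearchFieldDataType.String),
    # Vector field: dimensions must match the embedding model (1536 for text-embedding-ada-002)
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="my-vector-profile",
    ),
]

These field objects can then be passed as the fields= argument to AzureSearch in place of the dictionaries.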