Some challenge while trying to add documents to Azure Search

I'm looking for a way to query Azure AI Search with vector indexing from a Confluence Data Center server, using the following Python code.

I'm getting an error:

ERROR - Failed to upload documents to Azure Search: 'dict' object has no attribute 'name'

The code fails on vector_store.add_documents(split_docs). I reviewed https://python.langchain.com/docs/concepts/vectorstores/ and https://python.langchain.com/docs/concepts/text_splitters/.

I understand I need to convert the split_docs structure to a list; is that the right approach?

The data structure is as follows:

First document type: <class 'langchain_core.documents.base.Document'>

First document content: page_content='Be aware: Described steps require manual actions. Already automated actions are removed from below guide. It is also possible that some version skips "Y" part, so then any Y-1 version is X-1.Preparing new Release Candidate (RC,' metadata={'title': 'Managing EBF and RC branches during release - rules and steps for 10.5 line', 'id': '294514485', 'source': 'https://confluencewiki.XXX/display/ABC/Managing+EBF+and+RC+branches+during+release+-+rules+and+steps+for+10.5+line', 'when': '2024-08-12T05:06:02.270-07:00'}

import os
import logging
from langchain_community.document_loaders import ConfluenceLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import AzureSearch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from langchain_core.documents import Document

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("confluence_rag.log"),  # Log to a file
        logging.StreamHandler()  # Also log to console
    ]
)
logger = logging.getLogger(__name__)

# Configuration
CONFLUENCE_URL = "https://confluencewiki.XXX/"  # Base URL of your Confluence Data Center
CONFLUENCE_USERNAME = "XXZ"
CONFLUENCE_TOKEN = "X+X"  # Use password or personal access token
CONFLUENCE_SPACE_KEY = "SCAN"  # Optional: Specify a space key to limit the scope

AZURE_SEARCH_ENDPOINT = "https://XXX.search.windows.net"
AZURE_SEARCH_INDEX_NAME = "confluence-index"
AZURE_SEARCH_API_KEY = "XXX"

AZURE_OPENAI_ENDPOINT = "https://XXX.openai.azure.com/"
AZURE_OPENAI_API_KEY = "XXX"
AZURE_OPENAI_DEPLOYMENT_NAME = "embeddingmodel"
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = "embeddingmodel"  # Embedding deployment name

# Initialize Azure OpenAI Embeddings
logger.info(f"Initializing Azure OpenAI Embeddings with deployment: {AZURE_OPENAI_EMBEDDING_DEPLOYMENT}")
try:
    embeddings = AzureOpenAIEmbeddings(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
        api_version="2023-05-15",
    )
    test_embedding = embeddings.embed_query("Test sentence")
    logger.info(f"Embedding test successful: {len(test_embedding)} dimensions")
except Exception as e:
    logger.error(f"Failed to initialize Azure OpenAI Embeddings: {e}")
    raise

# Initialize Azure OpenAI Chat Model
logger.info(f"Initializing Azure OpenAI Chat Model with deployment: {AZURE_OPENAI_DEPLOYMENT_NAME}")
try:
    llm = AzureChatOpenAI(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        deployment_name=AZURE_OPENAI_DEPLOYMENT_NAME,
        api_version="2023-05-15",
        temperature=0.7,
    )
except Exception as e:
    logger.error(f"Failed to initialize Azure OpenAI Chat Model: {e}")
    raise

# Step 1: Load Confluence Data
logger.info(f"Loading Confluence data from space: {CONFLUENCE_SPACE_KEY}")
loader = ConfluenceLoader(
    url=CONFLUENCE_URL,
    token=CONFLUENCE_TOKEN,
    cloud=False,
    space_key=CONFLUENCE_SPACE_KEY,
    limit=100,
)
try:
    documents = loader.load()
    logger.info(f"Loaded {len(documents)} documents from Confluence")
except Exception as e:
    logger.error(f"Failed to load Confluence data: {e}")
    raise


# Step 2: Split Documents
logger.info("Splitting documents into chunks")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = text_splitter.split_documents(documents)
logger.info(f"Split into {len(split_docs)} chunks")
logger.info(f"First split_doc: {split_docs[0].page_content}, Metadata: {split_docs[0].metadata}")

# Test Azure Search Connection
logger.info("Testing Azure Search connection")
try:
    credential = AzureKeyCredential(AZURE_SEARCH_API_KEY)
    index_client = SearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=credential)
    index_client.get_service_statistics()
    logger.info("Azure Search connection successful")
except Exception as e:
    logger.error(f"Azure Search connection failed: {e}")
    raise

# Step 3: Initialize Azure Search Vector Store
logger.info(f"Initializing Azure Search Vector Store with index: {AZURE_SEARCH_INDEX_NAME}")
try:
    vector_store = AzureSearch(
        azure_search_endpoint=AZURE_SEARCH_ENDPOINT,
        azure_search_key=AZURE_SEARCH_API_KEY,
        index_name=AZURE_SEARCH_INDEX_NAME,
        embedding_function=embeddings.embed_query,
        fields=[
            {"name": "id", "type": "Edm.String", "key": True},
            {"name": "title", "type": "Edm.String", "searchable": True, "filterable": True},
            {"name": "content", "type": "Edm.String", "searchable": True},
            {
                "name": "embedding",
                "type": "Collection(Edm.Single)",
                "searchable": True,
                "vectorSearchDimensions": 1536,
                "vectorSearchConfiguration": "my-vector-profile",
            },
        ],
        vector_search_configuration={
            "vectorSearches": [
                {
                    "profiles": [
                        {"name": "my-vector-profile", "algorithmConfigurationName": "my-hnsw-config"}
                    ],
                    "algorithms": [
                        {"name": "my-hnsw-config", "kind": "hnsw"}
                    ],
                }
            ]
        }
    )
except Exception as e:
    logger.error(f"Failed to initialize Azure Search Vector Store: {e}")
    logger.debug(f"Endpoint: {AZURE_SEARCH_ENDPOINT}, Index: {AZURE_SEARCH_INDEX_NAME}")
    raise

# Step 4: Add Documents to Azure Search
logger.info("Uploading documents to Azure Search")
logger.info(f"First document type: {type(split_docs[0])}")
logger.info(f"First document content: {split_docs[0]}")
logger.info(f"First loaded document type: {type(documents[0])}")
logger.info(f"First loaded document content: {documents[0]}")


try:
    vector_store.add_documents(split_docs)
    logger.info(f"Uploaded {len(split_docs)} documents to Azure Search")
except Exception as e:
    logger.error(f"Failed to upload documents to Azure Search: {e}")
    raise

# Step 5: Set up RetrievalQA Chain
logger.info("Setting up RetrievalQA chain")
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

# Step 6: Query the System
def query_confluence(user_query):
    logger.info(f"Querying Confluence with: {user_query}")
    try:
        result = qa_chain({"query": user_query})
        logger.info("Query completed successfully")
        return result["result"], result["source_documents"]
    except Exception as e:
        logger.error(f"Query failed: {e}")
        raise

# Example Usage
if __name__ == "__main__":
    user_query = "Please point out the most important data from IT space?"
    logger.debug(f"Starting main execution with query: {user_query}")
    answer, sources = query_confluence(user_query)
    logger.info("Answer generated:")
    print("Answer:", answer)
    print("\nSources:")
    for doc in sources:
        logger.info(f"Source: {doc.metadata['title']} (ID: {doc.metadata['id']})")
        print(f"- {doc.metadata['title']} (ID: {doc.metadata['id']})")
asked Mar 8 at 16:12 by shlco, edited Mar 8 at 16:53
  • Ensure split_docs is a list of dictionaries with correctly formatted attributes before calling vector_store.add_documents(). – Aslesha Kantamsetti, Mar 18 at 15:09

1 Answer

ERROR - Failed to upload documents to Azure Search: 'dict' object has no attribute 'name'

  • split_docs is a list of Document objects from langchain_core.documents, but AzureSearch.add_documents() here expects a list of dictionaries, where each dictionary represents a document.

  • Each document added to Azure Search must match the field names and structure defined in the index schema; see the Microsoft documentation on index definitions in Azure AI Search for details. (A related pitfall is sketched right after this list.)
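
As an aside, this exact "'dict' object has no attribute 'name'" message can also be raised before any documents are uploaded, if the fields argument itself is a list of plain dicts: the LangChain AzureSearch wrapper reads a .name attribute from each field entry when it creates or validates the index. Here is a minimal sketch of the same schema built from azure-search-documents model objects instead of dicts (assuming a recent azure-search-documents package; names such as my-vector-profile are carried over from the question):

from azure.search.documents.indexes.models import (
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
)

# Field objects expose the .name attribute the wrapper expects.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="title", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,  # must match the embedding model output size
        vector_search_profile_name="my-vector-profile",
    ),
]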

To avoid the error, convert split_docs to a list of dictionaries.

Before passing the split_docs list to add_documents(), transform each Document into a dictionary matching the Azure Search schema.

Modify the code before vector_store.add_documents(split_docs) as below:

# Format each chunk to match the Azure Search index schema
formatted_docs = [
    {
        "id": doc.metadata.get("id", str(index)),  
        "title": doc.metadata.get("title", "Untitled"),
        "content": doc.page_content,
        "embedding": embeddings.embed_query(doc.page_content), 
    }
    for index, doc in enumerate(split_docs)
]

try:
    vector_store.add_documents(formatted_docs)
    logger.info(f"Uploaded {len(formatted_docs)} documents to Azure Search")
except Exception as e:
    logger.error(f"Failed to upload documents to Azure Search: {e}")
    raise
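
One practical note on the snippet above: it calls embed_query once per chunk, i.e. one request per document. For many chunks, embed_documents sends the texts in batches; here is a sketch of the same conversion under that assumption:

# Batch the embedding calls with embed_documents instead of one
# embed_query call per chunk (fewer round trips to Azure OpenAI).
texts = [doc.page_content for doc in split_docs]
vectors = embeddings.embed_documents(texts)

formatted_docs = [
    {
        "id": doc.metadata.get("id", str(index)),
        "title": doc.metadata.get("title", "Untitled"),
        "content": doc.page_content,
        "embedding": vector,
    }
    for index, (doc, vector) in enumerate(zip(split_docs, vectors))
]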

Complete code:

import logging
from langchain_community.vectorstores import AzureSearch
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from azure.core.exceptions import AzureError

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

AZURE_SEARCH_ENDPOINT = "https://your-search-service.search.windows.net"
AZURE_SEARCH_API_KEY = "your-api-key"
AZURE_SEARCH_INDEX_NAME = "your-index-name"

OPENAI_API_KEY = "your-openai-api-key"
OPENAI_EMBEDDING_MODEL = "text-embedding-ada-002"

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=OPENAI_EMBEDDING_MODEL)

vector_store = AzureSearch(
    azure_search_endpoint=AZURE_SEARCH_ENDPOINT,
    azure_search_key=AZURE_SEARCH_API_KEY,
    index_name=AZURE_SEARCH_INDEX_NAME,
    embedding_function=embeddings.embed_query,
    fields=[
        {"name": "id", "type": "Edm.String", "key": True},
        {"name": "title", "type": "Edm.String", "searchable": True, "filterable": True},
        {"name": "content", "type": "Edm.String", "searchable": True},
        {
            "name": "embedding",
            "type": "Collection(Edm.Single)",
            "searchable": True,
            "vectorSearchDimensions": 1536,
            "vectorSearchConfiguration": "my-vector-profile",
        },
    ]
)

split_docs = [
    Document(page_content="This is the first document.", metadata={"id": "doc1", "title": "First Document"}),
    Document(page_content="This is the second document.", metadata={"id": "doc2", "title": "Second Document"})
]

formatted_docs = [
    {
        "id": doc.metadata.get("id", str(index)),  
        "title": doc.metadata.get("title", "Untitled"),
        "content": doc.page_content,
        "embedding": embeddings.embed_query(doc.page_content), 
    }
    for index, doc in enumerate(split_docs)
]


try:
    vector_store.add_documents(formatted_docs)
    logger.info(f"Uploaded {len(formatted_docs)} documents to Azure Search")
except AzureError as e:
    logger.error(f"Azure Search Error: {e}")
except Exception as e:
    logger.error(f"Failed to upload documents to Azure Search: {e}")
    raise

  • Also ensure the index fields match the structure of the documents you are uploading; a quick schema check is sketched below.
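
For that check, SearchIndexClient can read back the live index definition so the field names can be compared against the keys being uploaded; a minimal sketch, reusing the endpoint, key, and formatted_docs from above:

# Read back the index schema and compare its field names against the
# keys of one formatted document before uploading.
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient

index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AZURE_SEARCH_API_KEY),
)
index = index_client.get_index(AZURE_SEARCH_INDEX_NAME)
index_fields = {field.name for field in index.fields}
missing = set(formatted_docs[0].keys()) - index_fields
if missing:
    logger.warning(f"Document keys not in the index schema: {missing}")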
