admin管理员组文章数量:1242774
I tried to extract vendor and product details from an email into an Excel sheet, including fields like **Mail_Date, Mail_Subject, Product_Name, Product_Quantity, Product_Price, Vendor_Name, Vendor_Email, Vendor_Phone, Vendor_Address, Vendor_GST_No, and Vendor_Website. **
However, I'm facing issues with the regular expressions. When the data is in a consistent structure, I can get partial output, but if the data is unstructured, the results are completely off, such as showing random values like "aaa," "676776," or "0000" etc...
How can I resolve this issue?
I have also tried using ML methods with Spacy, but they yield incorrect outputs. Any suggestions?
import re
import yaml
import imaplib
import spacy
import pandas as pd
from email import message_from_bytes
from bs4 import BeautifulSoup
from dateutil import parser
nlp = spacy.load("en_core_web_sm")
class ProcurementEmailParser:
def __init__(self, credentials_path):
self.credentials = self.load_credentials(credentials_path)
self.mail = None
# Not include Vendor_Address, Vendor_GST_No
self.output_columns = [
'Mail_Date', 'Mail_Subject', 'Product_Name',
'Product_Quantity', 'Product_Price',
'Vendor_Name', 'Vendor_Email', 'Vendor_Phone', 'Vendor_Website'
]
def load_credentials(self, path):
with open(path) as f:
return yaml.safe_load(f)
def connect_email(self):
self.mail = imaplib.IMAP4_SSL('imap.gmail')
self.mail.login(self.credentials['email'],
self.credentials['password'])
self.mail.select('inbox')
def extract_vendor_info(self, text):
vendor_info = {
'Vendor_Name': '',
'Vendor_Email': '',
'Vendor_Phone': '',
'Vendor_Website': ''
}
# Extract email
email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\. [A-Z|a-z]{2,}\b', text)
if email_match:
vendor_info['Vendor_Email'] = email_match.group(0)
# Extract phone
phone_match = re.search(r'(?:\+?91[\s-]?)?[6-9]\d{9}', text)
if phone_match:
vendor_info['Vendor_Phone'] =phone_match.group(0).replace
(' ', '')
# Extract website
website_match = re.search(r'(?:https?://)?(?:www\.)?[\w.-]+\.[a-zA-Z]{2,}', text)
if website_match:
vendor_info['Vendor_Website'] = website_match.group(0)
# Extract vendor name from signature
if 'Regards,' in text:
signature_block = text.split('Regards,')[-1].strip()
lines = [line.strip() for line in signature_block.split('\n') if line.strip()]
if lines:
vendor_info['Vendor_Name'] = re.sub(r'[^a-zA-Z\s]', '', lines[0]).strip()
return vendor_info
def safe_float_conversion(self, value):
try:
return float(value.replace(',', '')) if value else None
except:
return None
def extract_products(self, text):
products = []
# Pattern 1: Table format with price
table_pattern = r'(\d+)\s+(.+?)\s+(\d+)\s+([\d,]+)\s+([\d,]+)'
for match in re.finditer(table_pattern, text):
products.append({
'Product_Name': match.group(2).strip(),
'Product_Quantity': int(match.group(3)),
'Product_Price': self.safe_float_conversion(match.group(5))})
# Pattern 2: Line items with optional price
line_pattern = r'(.+?)\s+[-–]\s+(\d+)\s*(?:nos|units|qty)\s*[-–]?\s*([₹$]?[\d,]+)?'
for match in re.finditer(line_pattern, text, re.IGNORECASE):
product_name = match.group(1).strip()
quantity = int(match.group(2))
price = self.safe_float_conversion(match.group(3)) if match.group(3) else None
products.append({
'Product_Name': product_name,
'Product_Quantity': quantity,
'Product_Price': price
})
# Pattern 3: NLP-based extraction
if not products:
doc = nlp(text)
current_product = {'name': '', 'qty': None, 'price': None}
for ent in doc.ents:
if ent.label_ == 'PRODUCT':
current_product['name'] = ent.text
elif ent.label_ == 'QUANTITY' and 'nos' in ent.text.lower():
current_product['qty'] = int(re.search(r'\d+', ent.text).group())
elif ent.label_ == 'MONEY':
current_product['price'] = self.safe_float_conversion(ent.text)
if current_product['name'] and current_product['qty']:
products.append({
'Product_Name': current_product['name'],
'Product_Quantity': current_product['qty'],
'Product_Price': current_product['price']
})
current_product = {'name': '', 'qty': None, 'price': None}
return products
def process_email(self, email_msg):
try:
# Extract email content
text_content = ''
for part in email_msg.walk():
if part.get_content_type() == 'text/plain':
text_content += part.get_payload(decode=True).decode('utf-8', 'ignore')
elif part.get_content_type() == 'text/html':
html_content = part.get_payload(decode=True).decode('utf-8', 'ignore')
soup = BeautifulSoup(html_content, 'html.parser')
text_content += '\n' + soup.get_text(separator=' ', strip=True)
# Clean text
text_content = re.sub(r'\s+', ' ', text_content).strip()
# Extract vendor info
vendor_info = self.extract_vendor_info(text_content)
# Extract products
products = self.extract_products(text_content)
# Create records
records = []
for product in products:
record = {
'Mail_Date': parser.parse(email_msg['Date']).strftime('%Y-%m-%d %H:%M:%S'),
'Mail_Subject': email_msg.get('Subject',
'No Subject'), **product, **vendor_info } records.append(record)
return records
except Exception as e:
print(f"Error processing email: {str(e)}")
return []
def process_emails(self, limit=50, save_path='output.xlsx'):
self.connect_email()
_, msg_ids = self.mail.search(None, 'ALL')
all_data = []
for msg_id in msg_ids[0].split()[-limit:]:
try:
_, msg_data = self.mail.fetch(msg_id, '(RFC822)')
email_msg = message_from_bytes(msg_data[0][1])
all_data.extend(self.process_email(email_msg))
except Exception as e:
print(f"Error processing email {msg_id.decode()}: {str(e)}")
continue
# Save to Excel
df = pd.DataFrame(all_data, columns=self.output_columns)
df.to_excel(save_path, index=False)
return df
if __name__ == "__main__":
parser = ProcurementEmailParser("C:\\Users\\one\\credentials.yml")
df = parser.process_emails(limit=50, save_path="C:\\Users\\one\\vp_details.xlsx")
print(f"Successfully processed {len(df)} records")
本文标签: pythonVendor and products details extract from GmailStack Overflow
版权声明:本文标题:python - Vendor and products details extract from Gmail - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1740160528a2234343.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论