admin管理员组

文章数量:1242774

I am trying to extract vendor and product details from emails into an Excel sheet, with the following fields: **Mail_Date, Mail_Subject, Product_Name, Product_Quantity, Product_Price, Vendor_Name, Vendor_Email, Vendor_Phone, Vendor_Address, Vendor_GST_No, and Vendor_Website**.

However, I'm facing issues with the regular expressions. When the data has a consistent structure, I get partial output, but when the data is unstructured, the results are completely off and contain random values such as "aaa", "676776", or "0000".

How can I resolve this issue?

I have also tried an ML-based approach with spaCy, but it yields incorrect output as well. Any suggestions?

import re
import yaml
import imaplib
import spacy
import pandas as pd
from email import message_from_bytes
from bs4 import BeautifulSoup
from dateutil import parser

# Load the spaCy "en_core_web_sm" pipeline once, at import time.
# ProcurementEmailParser.extract_products runs it as a last-resort fallback
# when none of its regular-expression patterns finds a product.
nlp = spacy.load("en_core_web_sm")

class ProcurementEmailParser:
    """Extract vendor and product details from e-mails into an Excel sheet.

    The parser logs in to an IMAP mailbox, reads the most recent messages,
    pulls product line items and vendor contact details out of each body
    with regular expressions (falling back to spaCy entities for products),
    and writes one row per product to an Excel file.

    All extraction is heuristic.  It works best when the body text keeps
    its original line breaks, because the line-item and signature
    heuristics are line based.
    """

    # Characters tolerated around a price, e.g. "₹1,200" or "$ 40".
    _CURRENCY_CHARS = '₹$€£'

    _EMAIL_RE = re.compile(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    )

    # Indian mobile number with optional +91/91 prefix.  The digit guards
    # stop the pattern from matching inside longer numbers (order ids etc.).
    _PHONE_RE = re.compile(r'(?<!\d)(?:\+?91[\s-]?)?[6-9]\d{9}(?!\d)')

    # Host name with optional scheme / "www.".  The look-behind and the
    # trailing look-ahead reject tokens that belong to an e-mail address
    # (both the local part "john.doe" and the domain after "@").
    _WEBSITE_RE = re.compile(
        r'(?<![\w@.-])(?:https?://)?(?:www\.)?'
        r'[\w-]+(?:\.[\w-]+)*\.[a-zA-Z]{2,}\b(?![\w.-]*@)'
    )

    # Table row: <number> <name> <quantity> <number> <price>.
    # Groups 1 and 4 are presumably a serial number and a unit price; only
    # name (2), quantity (3) and the last number (5) are used.
    _TABLE_RE = re.compile(r'(\d+)\s+(.+?)\s+(\d+)\s+([\d,]+)\s+([\d,]+)')

    # "<name> - <qty> nos|units|qty [- <price>]".  `[^\S\r\n]` is
    # "whitespace except line breaks": the whole item must sit on one line,
    # otherwise a number on the following line is mistaken for the price.
    _LINE_ITEM_RE = re.compile(
        r'(.+?)[^\S\r\n]+[-–][^\S\r\n]+(\d+)[^\S\r\n]*(?:nos|units|qty)\b'
        r'[^\S\r\n]*[-–]?[^\S\r\n]*([₹$]?\d[\d,]*(?:\.\d+)?)?',
        re.IGNORECASE,
    )

    def __init__(self, credentials_path):
        """Load credentials from the YAML file at *credentials_path*."""
        self.credentials = self.load_credentials(credentials_path)
        self.mail = None
        # Not include Vendor_Address, Vendor_GST_No
        self.output_columns = [
            'Mail_Date', 'Mail_Subject', 'Product_Name',
            'Product_Quantity', 'Product_Price',
            'Vendor_Name', 'Vendor_Email', 'Vendor_Phone', 'Vendor_Website',
        ]

    def load_credentials(self, path):
        """Return the parsed YAML document stored at *path*.

        The result must provide the 'email' and 'password' keys that
        `connect_email` reads.
        """
        with open(path, encoding='utf-8') as f:
            return yaml.safe_load(f)

    def connect_email(self, host='imap.gmail.com'):
        """Open an IMAP-over-SSL session, log in and select the inbox.

        Args:
            host: IMAP server host name.  Defaults to Gmail's server.
        """
        self.mail = imaplib.IMAP4_SSL(host)
        self.mail.login(self.credentials['email'],
                        self.credentials['password'])
        self.mail.select('inbox')

    def extract_vendor_info(self, text):
        """Return vendor contact details found in *text*.

        Args:
            text: Message body.  Line breaks should be preserved: the
                vendor name is taken from the first non-empty line after
                the last 'Regards,'.

        Returns:
            dict with the keys 'Vendor_Name', 'Vendor_Email',
            'Vendor_Phone' and 'Vendor_Website'; a value is '' when
            nothing was found.
        """
        vendor_info = {
            'Vendor_Name': '',
            'Vendor_Email': '',
            'Vendor_Phone': '',
            'Vendor_Website': ''
        }

        email_match = self._EMAIL_RE.search(text)
        if email_match:
            vendor_info['Vendor_Email'] = email_match.group(0)

        phone_match = self._PHONE_RE.search(text)
        if phone_match:
            vendor_info['Vendor_Phone'] = phone_match.group(0).replace(' ', '')

        website_match = self._WEBSITE_RE.search(text)
        if website_match:
            vendor_info['Vendor_Website'] = website_match.group(0)

        # Vendor name: first non-empty line of the signature block.
        if 'Regards,' in text:
            signature_block = text.split('Regards,')[-1]
            lines = [ln.strip() for ln in signature_block.splitlines()
                     if ln.strip()]
            if lines:
                vendor_info['Vendor_Name'] = re.sub(
                    r'[^a-zA-Z\s]', '', lines[0]).strip()

        return vendor_info

    def safe_float_conversion(self, value):
        """Convert a price string such as '₹1,200' to a float.

        Surrounding whitespace and currency symbols and thousands
        separators are removed first.

        Returns:
            The number as a float, or None when *value* is empty or is not
            a numeric string.
        """
        if not value:
            return None
        try:
            cleaned = value.strip().strip(self._CURRENCY_CHARS).strip()
            return float(cleaned.replace(',', ''))
        except (AttributeError, TypeError, ValueError):
            return None

    def extract_products(self, text):
        """Return the product line items found in *text*.

        Two regular-expression layouts are tried (table rows, then
        "name - qty nos - price" lines).  Only when both find nothing is
        the spaCy pipeline consulted.

        Returns:
            list of dicts with the keys 'Product_Name',
            'Product_Quantity' (int) and 'Product_Price' (float or None).
        """
        products = []

        # Pattern 1: Table format with price
        for match in self._TABLE_RE.finditer(text):
            products.append({
                'Product_Name': match.group(2).strip(),
                'Product_Quantity': int(match.group(3)),
                'Product_Price': self.safe_float_conversion(match.group(5)),
            })

        # Pattern 2: Line items with optional price
        for match in self._LINE_ITEM_RE.finditer(text):
            products.append({
                'Product_Name': match.group(1).strip(),
                'Product_Quantity': int(match.group(2)),
                'Product_Price': self.safe_float_conversion(match.group(3)),
            })

        # Pattern 3: NLP-based extraction
        if not products:
            products = self._extract_products_nlp(text)

        return products

    def _extract_products_nlp(self, text):
        """Build products from spaCy PRODUCT / QUANTITY / MONEY entities."""
        products = []
        current = {'name': '', 'qty': None, 'price': None}

        def flush():
            # Emit the pending product only once it has a name and quantity.
            if current['name'] and current['qty']:
                products.append({
                    'Product_Name': current['name'],
                    'Product_Quantity': current['qty'],
                    'Product_Price': current['price'],
                })
                current.update(name='', qty=None, price=None)

        for ent in nlp(text).ents:
            if ent.label_ == 'PRODUCT':
                # A new product starts: emit the previous one now, so that
                # a price mentioned after its quantity is not lost.
                flush()
                current['name'] = ent.text
            elif ent.label_ == 'QUANTITY' and 'nos' in ent.text.lower():
                digits = re.search(r'\d+', ent.text)
                if digits:  # e.g. "five nos" carries no digits
                    current['qty'] = int(digits.group())
            elif ent.label_ == 'MONEY':
                current['price'] = self.safe_float_conversion(ent.text)

        flush()
        return products

    @staticmethod
    def _normalize_text(text):
        """Collapse whitespace inside each line and drop empty lines.

        Line breaks are kept on purpose: the signature and line-item
        heuristics rely on them.
        """
        lines = (re.sub(r'\s+', ' ', line).strip()
                 for line in text.splitlines())
        return '\n'.join(line for line in lines if line)

    @staticmethod
    def _html_to_text(html):
        """Render an HTML body as text, one line per block element / row."""
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup.find_all(['script', 'style']):
            tag.decompose()
        for br in soup.find_all('br'):
            br.replace_with('\n')
        for block in soup.find_all(['p', 'div', 'tr', 'li',
                                    'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            block.append('\n')
        # No strip=True here: it would discard the '\n' markers added above.
        return soup.get_text(separator=' ')

    @staticmethod
    def _decode_part(part):
        """Return the decoded text of a MIME part ('' if it has no payload)."""
        payload = part.get_payload(decode=True)
        if payload is None:
            return ''
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, 'ignore')
        except LookupError:  # charset name unknown to Python
            return payload.decode('utf-8', 'ignore')

    def process_email(self, email_msg):
        """Turn one parsed message into a list of output records.

        The text/plain body is preferred; the text/html body is used only
        when there is no plain text, so multipart/alternative messages are
        not counted twice.  Attachments are ignored.

        Args:
            email_msg: an `email.message.Message`.

        Returns:
            One dict per extracted product, combining mail date, subject,
            product fields and vendor fields.  Empty when no product was
            found or when processing failed (the error is printed).
        """
        # Imported locally so this method does not depend on a module-level
        # name for date parsing.
        from email.utils import parsedate_to_datetime

        try:
            plain_parts = []
            html_parts = []
            for part in email_msg.walk():
                if part.get_content_disposition() == 'attachment':
                    continue
                content_type = part.get_content_type()
                if content_type == 'text/plain':
                    plain_parts.append(self._decode_part(part))
                elif content_type == 'text/html':
                    html_parts.append(
                        self._html_to_text(self._decode_part(part)))

            text_content = self._normalize_text('\n'.join(plain_parts))
            if not text_content:
                text_content = self._normalize_text('\n'.join(html_parts))

            vendor_info = self.extract_vendor_info(text_content)
            products = self.extract_products(text_content)

            # The Date header follows RFC 5322; a missing or malformed one
            # should not discard the whole message.
            try:
                mail_date = parsedate_to_datetime(
                    email_msg['Date']).strftime('%Y-%m-%d %H:%M:%S')
            except (TypeError, ValueError):
                mail_date = ''
            mail_subject = email_msg.get('Subject', 'No Subject')

            return [
                {
                    'Mail_Date': mail_date,
                    'Mail_Subject': mail_subject,
                    **product,
                    **vendor_info,
                }
                for product in products
            ]

        except Exception as e:
            # Best-effort boundary: one bad message must not stop the batch.
            print(f"Error processing email: {str(e)}")
            return []

    def process_emails(self, limit=50, save_path='output.xlsx'):
        """Process the newest *limit* inbox messages and save them to Excel.

        Args:
            limit: maximum number of most recent messages to read.
            save_path: destination .xlsx file.

        Returns:
            The pandas DataFrame that was written to *save_path*.
        """
        self.connect_email()
        all_data = []
        try:
            status, data = self.mail.search(None, 'ALL')
            msg_ids = (data[0] or b'').split() if status == 'OK' else []
            if limit is not None and limit > 0:
                msg_ids = msg_ids[-limit:]

            for msg_id in msg_ids:
                try:
                    _, msg_data = self.mail.fetch(msg_id, '(RFC822)')
                    email_msg = message_from_bytes(msg_data[0][1])
                    all_data.extend(self.process_email(email_msg))
                except Exception as e:
                    print(f"Error processing email {msg_id.decode()}: {str(e)}")
                    continue
        finally:
            # Always release the server session, even after a failure.
            try:
                self.mail.logout()
            except (imaplib.IMAP4.error, OSError):
                pass

        # Save to Excel
        df = pd.DataFrame(all_data, columns=self.output_columns)
        df.to_excel(save_path, index=False)
        return df

if __name__ == "__main__":
    # Do not call this variable `parser`: at module level that would rebind
    # the `parser` imported from dateutil at the top of the file and break
    # any later `parser.parse(...)` call.
    email_parser = ProcurementEmailParser("C:\\Users\\one\\credentials.yml")
    df = email_parser.process_emails(limit=50, save_path="C:\\Users\\one\\vp_details.xlsx")
    print(f"Successfully processed {len(df)} records")


   

本文标签: pythonVendor and products details extract from GmailStack Overflow