admin管理员组文章数量:1355697
I have several images of animals in the same directory as the script. How can I modify the following script to process an image but force the output to only be a single selection from a list:
from pathlib import Path
import base64
import requests
def encode_image_to_base64(image_path):
"""Convert an image file to base64 string."""
return base64.b64encode(image_path.read_bytes()).decode('utf-8')
def extract_text_from_image(image_path):
"""Send image to local Llama API and get text description."""
base64_image = encode_image_to_base64(image_path)
payload = {
"model": "llama3.2-vision",
"stream": False,
"messages": [
{
"role": "user",
"content": (
"With just one word, classify this image into one of these exact categories:\n"
"- dog\n"
"- cat\n"
"- butterfly\n"
),
"images": [base64_image]
}
]
}
response = requests.post(
"http://localhost:11434/api/chat",
json=payload,
headers={"Content-Type": "application/json"}
)
return response.json().get('message', {}).get('content', 'No text extracted')
def process_directory():
"""Process all images in current directory and create text files."""
for image_path in Path('.').glob('*'):
if image_path.suffix.lower() in {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp'}:
print(f"\nProcessing {image_path}...")
text = extract_text_from_image(image_path)
image_path.with_suffix('.txt').write_text(text, encoding='utf-8')
print(f"Created {image_path.with_suffix('.txt')}")
process_directory()
However, despite different prompt engineering, I get some answers that will do more than just select from a list. For example, it may occassionally output "From the image, there is a winged insect, therefore my guess is "butterfly." ANSWER: Butterfly." If I define the list as allowed_options = ['dog', 'cat', 'butterfly']
I only want it to output a single string from that list and nothing else.
I have several images of animals in the same directory as the script. How can I modify the following script to process an image but force the output to only be a single selection from a list:
from pathlib import Path
import base64
import requests
def encode_image_to_base64(image_path):
"""Convert an image file to base64 string."""
return base64.b64encode(image_path.read_bytes()).decode('utf-8')
def extract_text_from_image(image_path):
"""Send image to local Llama API and get text description."""
base64_image = encode_image_to_base64(image_path)
payload = {
"model": "llama3.2-vision",
"stream": False,
"messages": [
{
"role": "user",
"content": (
"With just one word, classify this image into one of these exact categories:\n"
"- dog\n"
"- cat\n"
"- butterfly\n"
),
"images": [base64_image]
}
]
}
response = requests.post(
"http://localhost:11434/api/chat",
json=payload,
headers={"Content-Type": "application/json"}
)
return response.json().get('message', {}).get('content', 'No text extracted')
def process_directory():
"""Process all images in current directory and create text files."""
for image_path in Path('.').glob('*'):
if image_path.suffix.lower() in {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp'}:
print(f"\nProcessing {image_path}...")
text = extract_text_from_image(image_path)
image_path.with_suffix('.txt').write_text(text, encoding='utf-8')
print(f"Created {image_path.with_suffix('.txt')}")
process_directory()
However, despite different prompt engineering, I get some answers that will do more than just select from a list. For example, it may occassionally output "From the image, there is a winged insect, therefore my guess is "butterfly." ANSWER: Butterfly." If I define the list as allowed_options = ['dog', 'cat', 'butterfly']
I only want it to output a single string from that list and nothing else.
1 Answer
Reset to default 2Llama3.2-vision supports structured outputs, so you can specify the schema of the response you want, and the model should follow it. Your request should look like this:
payload = {
"model": "llama3.2-vision",
"stream": False,
"messages": [
{
"role": "user",
"content": (
"Classify this image into one of these exact categories:\n"
"- dog\n"
"- cat\n"
"- butterfly\n"
),
"images": [base64_image]
}
],
"format": {
"type": "object",
"properties": {
"animal": {
"enum": [
"dog",
"cat",
"butterfly"
],
"type": "string"
}
},
"required": [
"animal"
]
}
}
And now the output should be a JSON with an attribute animal
. For example:
{
"animal": "dog"
}
本文标签: pythonConstrain Llama32vision output to a list of optionsStack Overflow
版权声明:本文标题:python - Constrain Llama3.2-vision output to a list of options - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1744015946a2576330.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论