admin管理员组文章数量:1122850
c 中html抓取页面图片,[HTTP]
场景:
1. 有时需要对 HTML 页面里的图片进行格式转换,比如把浏览器不支持的图片格式转换为另一种浏览器支持的格式。
2.提取图片.
/*
* test_image.cpp
*
* Created on: 2013-11-6
* Author: Sai
*/
#include <stdlib.h>
#include <string.h>
#include "gtest/gtest.h"
#include "tool/log/debug.h"
#include "main.h"
// Returns a newly malloc'ed, NUL-terminated string made of the first
// head_len bytes of head followed by the first tail_len bytes of tail.
// Returns NULL if the allocation fails. The caller owns the result.
static char* ConcatRanges(const char* head,size_t head_len,const char* tail,size_t tail_len)
{
    // The cast is kept so the code also compiles as C++ (the file header
    // comment names this file test_image.cpp).
    char* joined = (char*)malloc(head_len + tail_len + 1);
    if(joined == NULL)
    {
        return NULL;
    }
    memcpy(joined,head,head_len);
    memcpy(joined + head_len,tail,tail_len);
    joined[head_len + tail_len] = '\0';
    return joined;
}

// Returns non-zero when s starts with a complete ".." path segment, i.e.
// ".." followed by '/' or by the end of the string. Names that merely begin
// with two dots (such as "..thumbs") are not parent references.
static int IsParentSegment(const char* s)
{
    return s[0] == '.' && s[1] == '.' && (s[2] == '/' || s[2] == '\0');
}

/*
 * Resolves the src attribute of an <img> found in an HTML file into a path.
 *
 * html_path: path of the HTML file; '/' is the only separator recognised.
 * image_src: the src value. Three forms are handled:
 *   1. "file:///X"  -> X (NULL when nothing follows the prefix).
 *   2. "../[../]X"  -> one directory of html_path is dropped per leading
 *                      "..". The first component of html_path is never
 *                      dropped; when there are more ".." than droppable
 *                      directories, the unresolved "../" remainder of
 *                      image_src is returned as is.
 *   3. anything else -> the directory part of html_path followed by
 *                      image_src (just image_src when html_path contains
 *                      no '/').
 *
 * Returns a malloc'ed string the caller must free(), or NULL when an
 * argument is NULL, the file:/// URL is empty, or allocation fails.
 */
static char* GetHtmlImagePath(const char* html_path,const char* image_src)
{
    static const char kAbsoluteTag[] = "file:///";
    const size_t tag_length = sizeof(kAbsoluteTag) - 1;

    if(html_path == NULL || image_src == NULL)
    {
        return NULL;
    }

    // 1. file:/// URL: everything after the prefix is already the path.
    if(strncmp(image_src,kAbsoluteTag,tag_length) == 0)
    {
        const char* local_path = image_src + tag_length;
        if(*local_path == '\0')
        {
            return NULL;
        }
        return ConcatRanges(local_path,strlen(local_path),"",0);
    }

    // 2. Path that climbs out of the HTML file's directory.
    if(IsParentSegment(image_src))
    {
        // Count only the leading run of "../" segments; a ".." appearing
        // later (e.g. inside a file name like "a..b.jpg") must not count.
        size_t up_levels = 1;
        const char* tail = image_src + 2; // just past the last leading ".."
        while(tail[0] == '/' && IsParentSegment(tail + 1))
        {
            ++up_levels;
            tail += 3;
        }

        // Walk html_path backwards. The first '/' met only separates the
        // file name from its directory; every further '/' drops one level.
        size_t remaining = up_levels;
        size_t cut = strlen(html_path);
        int passed_file_slash = 0;
        while(remaining > 0 && cut > 0)
        {
            --cut;
            if(html_path[cut] != '/')
            {
                continue;
            }
            if(!passed_file_slash)
            {
                passed_file_slash = 1;
                continue;
            }
            --remaining;
        }

        if(remaining > 0)
        {
            // html_path ran out of directories: strip the "../" segments
            // that were resolved and hand back the rest unchanged.
            const char* unresolved = image_src + 3 * (up_levels - remaining);
            return ConcatRanges(unresolved,strlen(unresolved),"",0);
        }
        // cut is the index of the '/' that ends the kept prefix; tail
        // starts with its own '/' (or is empty), so that '/' is excluded.
        return ConcatRanges(html_path,cut,tail,strlen(tail));
    }

    // 3. Plain relative src: it lives next to the HTML file.
    const char* last_slash = strrchr(html_path,'/');
    if(last_slash == NULL)
    {
        // html_path has no directory part, so there is nothing to prepend.
        return ConcatRanges(image_src,strlen(image_src),"",0);
    }
    return ConcatRanges(html_path,(size_t)(last_slash - html_path) + 1,
                        image_src,strlen(image_src));
}
// Checks GetHtmlImagePath() for one HTML file path against every supported
// kind of image src: one, two and three "../" levels, a plain relative
// path, and a file:/// URL.
TEST(test_image,GetImagePath)
{
    const char* html = "./resources/22/22.html";

    // Each row pairs an image src with the path expected for `html`.
    // The row order matches the order of the logged output.
    static const struct
    {
        const char* image_src;
        const char* expected;
    } kCases[] = {
        {"../image/1.jpg",         "./resources/image/1.jpg"},
        {"../../image/1.jpg",      "./image/1.jpg"},
        {"image/1.jpg",            "./resources/22/image/1.jpg"},
        {"file:///C:/image/1.jpg", "C:/image/1.jpg"},
        {"../../../image/1.jpg",   "../image/1.jpg"},
    };

    for(size_t i = 0; i < sizeof(kCases)/sizeof(kCases[0]); ++i)
    {
        char* str = GetHtmlImagePath(html,kCases[i].image_src);
        // Fatal check: a NULL result leaves nothing to log, compare or free.
        ASSERT_TRUE(str != NULL) << "image_src=" << kCases[i].image_src;
        QXLOGSTR(str);
        // Non-fatal string comparison: on mismatch both strings are printed
        // and execution still reaches free() below, so nothing leaks.
        EXPECT_STREQ(kCases[i].expected,str) << "image_src=" << kCases[i].image_src;
        free(str);
    }
}
输出:
./resources/image/1.jpg
./image/1.jpg
./resources/22/image/1.jpg
C:/image/1.jpg
../image/1.jpg
如果 image 的 src 带有 URL 转义字符(例如 %20),请先用下边的函数进行解码:
本文标签: c 中html抓取页面图片http
版权声明:本文标题:c 中html抓取页面图片,[HTTP] 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/biancheng/1710594433a764064.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论