admin管理员组

文章数量:1122850

C/C++ 中抓取 HTML 页面图片:解析图片的实际路径 [HTTP]

场景:

1.有时候需要对 HTML 页面里的图片(img)进行格式转换,比如把浏览器不支持的图片格式转换为另一种受支持的格式.

2.提取图片.

/*

* test_image.cpp

*

* Created on: 2013-11-6

* Author: Sai

*/

#include <stdlib.h>

#include <string.h>

#include "gtest/gtest.h"

#include "tool/log/debug.h"

#include "main.h"

/*
 * Builds a newly malloc()ed, NUL-terminated string made of the first
 * head_len bytes of head followed by the whole of tail.
 *
 * Returns NULL when the allocation fails. The caller owns the result.
 */
static char* JoinPathParts(const char* head,size_t head_len,const char* tail)
{
    size_t tail_len = strlen(tail);
    /* The cast keeps this compilable as C++ as well as C. */
    char* joined = (char*)malloc(head_len + tail_len + 1);
    if(joined == NULL)
    {
        return NULL;
    }
    memcpy(joined,head,head_len);
    memcpy(joined + head_len,tail,tail_len);
    joined[head_len + tail_len] = '\0';
    return joined;
}

/*
 * Resolves the src attribute of an <img> against the path of the HTML file
 * that references it.
 *
 * html_path  path of the HTML file, using '/' as separator.
 * image_src  value of the image's src attribute.
 *
 * Three forms of image_src are recognised:
 *   1. "file:///<path>"   -> "<path>" (NULL when nothing follows the prefix).
 *   2. "../"-prefixed     -> every leading ".." segment removes one directory
 *                            from the directory of html_path. If html_path
 *                            runs out of '/' separators first, the segments
 *                            that could not be applied are kept and the
 *                            remainder of image_src is returned unchanged.
 *   3. anything else      -> directory of html_path (including the trailing
 *                            '/') followed by image_src. When html_path has
 *                            no '/', image_src is returned as is.
 *
 * Returns a heap string the caller must release with free(), or NULL on a
 * NULL argument, an empty "file:///" URL, or allocation failure.
 */
static char* GetHtmlImagePath(const char* html_path,const char* image_src)
{
    if(image_src == NULL)
    {
        return NULL;
    }

    /* 1. "file:///" URL: the local path is everything after the prefix. */
    static const char kAbsoluteTag[] = "file:///";
    const size_t kAbsoluteTagLength = sizeof(kAbsoluteTag) - 1;
    if(strncmp(image_src,kAbsoluteTag,kAbsoluteTagLength) == 0)
    {
        const char* local_path = image_src + kAbsoluteTagLength;
        return *local_path ? JoinPathParts("",0,local_path) : NULL;
    }

    if(html_path == NULL)
    {
        return NULL;
    }

    /*
     * 2. Count the leading ".." path segments. Only a ".." that is followed
     * by '/' or by the end of the string is a parent reference; ".." inside
     * a file name (e.g. "a..b.jpg" or "..thumb.jpg") must not be counted.
     */
    size_t up_count = 0;
    const char* after_ups = image_src; /* just past the last leading ".." */
    const char* scan = image_src;
    while(scan[0] == '.' && scan[1] == '.' &&
          (scan[2] == '/' || scan[2] == '\0'))
    {
        ++up_count;
        after_ups = scan + 2;
        if(scan[2] == '\0')
        {
            break;
        }
        scan += 3;
    }

    if(up_count > 0)
    {
        size_t pending = up_count; /* ".." segments not applied yet */
        size_t slash_count = 0;
        size_t cut = 0;            /* html_path is kept up to, not including, cut */
        size_t pos = strlen(html_path);

        /*
         * Walk html_path backwards. The first '/' only separates the file
         * name from its directory; each further '/' consumes one "..".
         */
        while(pos > 0)
        {
            --pos;
            if(html_path[pos] != '/')
            {
                continue;
            }
            ++slash_count;
            if(slash_count > 1 && --pending == 0)
            {
                cut = pos;
                break;
            }
        }

        if(pending > 0)
        {
            /*
             * html_path has too few directories: drop the "../" segments
             * that were applied (3 characters each) and keep the rest.
             */
            return JoinPathParts("",0,image_src + 3 * (up_count - pending));
        }

        /* after_ups starts at the '/' that follows the last "..". */
        return JoinPathParts(html_path,cut,after_ups);
    }

    /* 3. Plain relative path: prepend the directory of html_path. */
    const char* last_slash = strrchr(html_path,'/');
    if(last_slash == NULL)
    {
        /* html_path is a bare file name, so there is no directory part. */
        return JoinPathParts("",0,image_src);
    }
    return JoinPathParts(html_path,(size_t)(last_slash - html_path) + 1,image_src);
}

// Checks GetHtmlImagePath against each supported form of image src:
// "../"-relative, plain relative, "file:///" URL, and a relative path that
// climbs above the directories present in the HTML path.
TEST(test_image,GetImagePath)
{
    // Every image src below is resolved against this HTML file location.
    const char* html = "./resources/22/22.html";

    static const struct
    {
        const char* image_src;
        const char* expected;
    } kCases[] =
    {
        {"../image/1.jpg",         "./resources/image/1.jpg"},
        {"../../image/1.jpg",      "./image/1.jpg"},
        {"image/1.jpg",            "./resources/22/image/1.jpg"},
        {"file:///C:/image/1.jpg", "C:/image/1.jpg"},
        {"../../../image/1.jpg",   "../image/1.jpg"},
    };

    for(size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i)
    {
        // Names the failing input in the gtest report.
        SCOPED_TRACE(kCases[i].image_src);

        char* str = GetHtmlImagePath(html,kCases[i].image_src);
        // GetHtmlImagePath may return NULL; stop before logging or comparing it.
        ASSERT_TRUE(str != NULL);
        QXLOGSTR(str);
        // Non-fatal comparison so the free() below always runs.
        EXPECT_STREQ(kCases[i].expected,str);
        free(str);
    }
}

输出:

./resources/image/1.jpg

./image/1.jpg

./resources/22/image/1.jpg

C:/image/1.jpg

../image/1.jpg

如果 image 的 src 带有 URL 转义字符(例如 %20),请先用下边的函数进行解码:

本文标签: c 中html抓取页面图片http