一个基于 curl 工具、用 C 语言编写的网页爬虫工具，主要是为了方便在 vim 里使用！
代码片段和文件信息
#include
#include
#include
#include
/*
 * Node of a doubly linked list of URLs.
 *
 * url  - string owned by the node; img_url_free() releases it with free().
 * pre  - neighbouring node on the head side (NULL for an unlinked node).
 * next - neighbouring node on the tail side (NULL for an unlinked node).
 */
struct _img_url_lnk {
    char *url;
    struct _img_url_lnk *pre;
    struct _img_url_lnk *next;
};

typedef struct _img_url_lnk img_url_lnk;
/*
 * Table of file-extension strings, one entry per row (each at most 31
 * characters plus the terminator).
 *
 * The entries must be separated by commas: adjacent string literals with no
 * comma between them are concatenated into a single element ("htmlhtm").
 */
static const char g_filter[][32] = {
    "html",
    "htm",
};
/* Head of the global URL list; NULL while the list is empty. */
static img_url_lnk *g_first_img_url = NULL;
/* Tail of the global URL list; img_url_push() appends here and img_url_pop() removes from here. */
static img_url_lnk *g_last_img_url = NULL;
/**
 * Allocate a new, unlinked URL list node.
 *
 * The url, pre and next members of the new node are all set to NULL.
 *
 * @return pointer to the new node, or NULL when malloc() fails.
 */
static img_url_lnk *img_url_malloc(void)
{
    img_url_lnk *ret = malloc(sizeof *ret);

    if (!ret)
        return NULL;

    ret->url = NULL;
    ret->pre = NULL;
    ret->next = NULL;

    /* Hand the node back: falling off the end of a non-void function
     * leaves the caller with an indeterminate pointer. */
    return ret;
}
/**
 * Release the URL string held by a list node and clear its link pointers.
 *
 * Does nothing when img is NULL. Calling it again on the same node is safe
 * because url is reset to NULL after being freed.
 *
 * The node structure itself is not freed here.
 * NOTE(review): nodes are obtained from malloc() in img_url_malloc(); confirm
 * that callers free the node afterwards, otherwise it leaks.
 *
 * @param img node whose url string is released; may be NULL.
 */
static void img_url_free(img_url_lnk *img)
{
    if (!img)
        return;

    free(img->url);     /* free(NULL) is a no-op, no guard needed */
    img->url = NULL;    /* do not leave a dangling pointer behind */
    img->pre = NULL;
    img->next = NULL;
}
/**
 * Append a node to the tail of the global URL list.
 *
 * When the list is empty the node becomes both head and tail; otherwise it
 * is linked after the current tail and becomes the new tail.
 *
 * @param img node to append.
 * @return 0 on success, -1 when img is NULL.
 */
static int img_url_push(img_url_lnk *img)
{
    if (img == NULL)
        return -1;

    /* Empty list: the node is head and tail at once. */
    if (g_first_img_url == NULL) {
        g_last_img_url = g_first_img_url = img;
        return 0;
    }

    img_url_lnk *tail = g_last_img_url;

    img->pre = tail;
    tail->next = img;
    g_last_img_url = img;
    return 0;
}
static img_url_lnk* img_url_pop(void)
{
img_url_lnk *tmp = g_last_img_url;
if (!tmp)
return tmp;
if (g_last_img_url == g_first_img_url) {
g_first_img_url = g_last_img_url = NULL;
} else {
g_last_img_url = g_last_img_url->pre;
}
return tmp;
}
/**
 * Copy url into html_name, replacing every '/' with '_', and print the
 * resulting name to stdout.
 *
 * @param url        NUL-terminated source string.
 * @param html_name  destination buffer; must have room for strlen(url) + 1
 *                   bytes. The result is always NUL-terminated.
 * @return number of characters written (excluding the terminator), or 0 when
 *         url or html_name is NULL.
 */
static int update_url_name(const char *url, char *html_name)
{
    if (!url || !html_name)
        return 0;

    size_t url_len = strlen(url);

    for (size_t j = 0; j < url_len; j++)
        html_name[j] = (url[j] == '/') ? '_' : url[j];

    /* Terminate explicitly: the buffer is printed with %s just below and
     * may hold stale bytes from a previous, longer name. */
    html_name[url_len] = '\0';

    printf("[%s] name = %s\n", __func__, html_name);
    return (int)url_len;
}
/**
 * Derive a short file name from the tail of url, store it in html_name and
 * print it to stdout.
 *
 * The URL is scanned backwards from its last character:
 *   - at '/' or ' ' the name is the text after that character;
 *   - at '%' the name is the text after the '%' and the two characters
 *     following it;
 *   - in both cases an empty candidate is not accepted and scanning goes on;
 *   - once 16 characters have been scanned without a match, the name is the
 *     last 16 characters.
 * When nothing matches, the whole URL is used. Any '/' left inside the chosen
 * text is written as '_'.
 *
 * @param url        NUL-terminated source string.
 * @param html_name  destination buffer; must have room for the name plus a
 *                   terminator. The result is always NUL-terminated.
 * @return length of the generated name, or 0 when url or html_name is NULL.
 */
static int gen_html_name(const char *url, char *html_name)
{
    const char *find = NULL;

    if (!url || !html_name)
        return 0;

    const int total_len = (int)strlen(url);
    /* Mirrors the original: an empty candidate resets this to 0, which in
     * turn disables the 16-character cut-off below.
     * NOTE(review): confirm whether that side effect is intended. */
    int url_len = total_len;

    for (int i = total_len - 1; i >= 0; i--) {
        /* NOTE(review): SOURCE shows a space literal here; confirm it was
         * not meant to be another character. */
        if (url[i] == '/' || url[i] == ' ') {
            find = url + i + 1;
            if ((url_len = (int)strlen(find)) > 0)
                break;
        } else if (url[i] == '%' && i + 3 <= total_len) {
            /* Only skip the two characters after '%' when they exist;
             * otherwise url + i + 3 would point past the terminator. */
            find = url + i + 3;
            if ((url_len = (int)strlen(find)) > 0)
                break;
        } else if (url_len - i > 16) {
            find = url + i + 1;
            url_len = (int)strlen(find);
            break;
        }
    }

    if (!find)
        find = url;

    for (int j = 0; j < url_len; j++)
        html_name[j] = (find[j] == '/') ? '_' : find[j];

    /* Terminate explicitly: the buffer is printed with %s just below. */
    html_name[url_len] = '\0';

    printf("[%s] name = %s\n", __func__, html_name);
    return url_len;
}
static int get_html_inner_urls_ex(char *buf)
{
int ret = -1;
int buf_len = 0;
int push_flag = 0;
char url[512] = { ‘ ‘ };
img_url_lnk *tmp = NULL;
int i = 0 j = 0;
if (!buf)
ret
版权声明：本文内容由互联网用户自发贡献，该文观点仅代表作者本人。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌抄袭侵权或违法违规的内容，请发送邮件举报，一经查实，本站将立刻删除。
评论列表(条)