#!/bin/bash

# Default settings; each one can be overridden by a command-line flag.

# File listing the urls to crawl (-u).
url_path="-"

# Value passed to wget -t (-t).
WGET_TIMECOUNT=2

# Value passed to wget -T (-T).
WGET_TIME=10

# Argument of the sleep issued before each background crawler starts (-s).
FORK_SLEEP_TIME=1

# Argument of the sleep issued before each url is fetched (-S).
ONEURL_SLEEP_TIME=1

# Number of url_<n> files the urls are distributed over (-p).
SPIDER_PID_NUM=6

#######################################
# Print the command-line synopsis and terminate the script.
# Outputs: the usage line on stdout
# Returns: never returns; exits with status 3
#######################################
usage() {
  # The space after echo matters: echo"..." is looked up as one command name.
  echo "usage:spider.sh -u url_path -d page_store_dir"
  exit 3
}

#######################################
# Print the tool name and version and terminate the script.
# Outputs: the version string on stdout
# Returns: never returns; exits with status 4
#######################################
version() {
  echo "same-source-tools-spider-1.0.0"
  exit 4
}

#######################################
# Parse the command-line flags into the script's global settings.
# Globals (written): url_path, spider_dir, WGET_TIMECOUNT, WGET_TIME,
#                    FORK_SLEEP_TIME, ONEURL_SLEEP_TIME, SPIDER_PID_NUM,
#                    LOG_PATH
# Arguments: the script's command-line arguments
# Returns:   0 when parsing completes; -h and unknown flags call usage,
#            -v calls version (both terminate the script)
#######################################
parse_options() {
  local opt
  # getopts keeps its position in OPTIND; start from the first argument.
  local OPTIND=1

  # h and v take no argument; they must be in the option string or getopts
  # reports them as unknown and their case arms are never reached.
  while getopts 'l:u:d:t:T:s:S:p:hv' opt; do
    case "${opt}" in
      u) url_path=${OPTARG} ;;
      d) spider_dir=${OPTARG} ;;
      t) WGET_TIMECOUNT=${OPTARG} ;;
      T) WGET_TIME=${OPTARG} ;;
      s) FORK_SLEEP_TIME=${OPTARG} ;;
      S) ONEURL_SLEEP_TIME=${OPTARG} ;;
      p) SPIDER_PID_NUM=${OPTARG} ;;
      l) LOG_PATH=${OPTARG} ;;
      h) usage ;;
      v) version ;;
      # getopts reports an unknown flag or a missing argument as a literal ?.
      \?) usage ;;
    esac
  done
}

parse_options "$@"

# Preflight: make sure the log file, the url list and the page directory are
# usable, then clear url files left by an earlier run.

# Create the log file when a log path was given with -l; without the guard
# touch would be called with no operand and fail.
if [[ -n "${LOG_PATH}" ]]; then
  touch -- "${LOG_PATH}"
fi

# Check that the url list file exists.
if [[ -e "${url_path}" ]]; then
  echo "spider test: ${url_path} is exist"
else
  echo "url_path spider test: ${url_path} is not exist"
  exit 1
fi

# Check that the page storage directory exists. The expansion is quoted so
# that an unset spider_dir fails here; unquoted it collapses to `[ -e ]`,
# which is true, and the script would go on to work on /url_*.
if [[ -d "${spider_dir}" ]]; then
  echo "spider test: ${spider_dir} is exist"
else
  echo "spider_dir spider test: ${spider_dir} is not exist"
  exit 2
fi

# Remove the url files of a previous run. This is done unconditionally
# (not only when url_0 exists) because the url files are appended to later.
rm -f -- "${spider_dir:?}"/url_*

# Create the per-process url files.

#######################################
# Create one url file per crawler process and deal the urls of the input
# list out to them in round-robin order.
# Globals (read): spider_dir, url_path, SPIDER_PID_NUM
# Outputs: ${spider_dir}/url_0 .. url_<SPIDER_PID_NUM-1>
#######################################
shard_urls() {
  local i line
  local no=0

  # NOTE(review): the original loop header was truncated after
  # `for ((i=0;i`; the bound SPIDER_PID_NUM is reconstructed from the
  # wrap-around test on that variable below -- confirm.
  for ((i = 0; i < SPIDER_PID_NUM; i++)); do
    touch -- "${spider_dir}/url_${i}"
  done

  # Write the urls to crawl into the per-process url files.
  # read -r keeps backslashes; the quoted printf keeps ? and * in a url from
  # being glob-expanded; the || clause keeps a last line with no newline.
  while read -r line || [[ -n "${line}" ]]; do
    printf '%s\n' "${line}" >> "${spider_dir}/url_${no}"
    no=$((no + 1))
    if ((no >= SPIDER_PID_NUM)); then
      no=0
    fi
  done < "${url_path}"
}

shard_urls

# Start the multi-process crawl.

#######################################
# Fetch every url listed in one url file with wget, one after another.
# Each page is stored under the md5 of its url; a page whose download failed
# is removed again.
# Globals (read): spider_dir, LOG_PATH, WGET_TIMECOUNT, WGET_TIME,
#                 ONEURL_SLEEP_TIME
# Arguments: $1 - path of the url file to process
# Outputs:   one NOTICE or ERROR line per url on stdout
# Returns:   0 when the url file does not exist
#######################################
crawl_url_file() {
  local url_file=$1
  local url url_md5 wget_status dateFlag

  [[ -e "${url_file}" ]] || return 0

  while read -r url || [[ -n "${url}" ]]; do
    # An empty line has nothing to download.
    [[ -n "${url}" ]] || continue

    sleep "${ONEURL_SLEEP_TIME}"

    # The trailing newline is hashed too, as `echo url | md5sum` did, so the
    # stored file names stay the same.
    url_md5=$(printf '%s\n' "${url}" | md5sum | awk '{print $1}')

    wget -o "${LOG_PATH}_${url_md5}" -O "${spider_dir}/${url_md5}" \
      -t "${WGET_TIMECOUNT}" -T "${WGET_TIME}" -- "${url}"
    # Save wget's status right away; running date first would overwrite $?.
    wget_status=$?

    dateFlag=$(date +"%Y%m%d-%H:%M:%S")
    if [[ ${wget_status} -eq 0 ]]; then
      echo "${dateFlag} NOTICE:spiderwgetsuccess ${url}"
    else
      echo "${dateFlag} ERROR:spiderwgeterror ${url}"
      rm -f -- "${spider_dir}/${url_md5}"
    fi
  done < "${url_file}"
}

# Start one background crawler per url file, FORK_SLEEP_TIME apart, then wait
# for all of them.
# NOTE(review): the original loop header was truncated after `for ((i=0;i`;
# the bound SPIDER_PID_NUM is reconstructed from the url_${i} file naming --
# confirm.
for ((i = 0; i < SPIDER_PID_NUM; i++)); do
  sleep "${FORK_SLEEP_TIME}"
  crawl_url_file "${spider_dir}/url_${i}" &
done
wait

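# Logo
#
# Tencent Cloud brings together a wealth of high-quality cloud computing usage and development experience for developers, fostering an open cloud computing technology ecosystem.
#
# More recommendations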