# 使用说明:在bin目录下建立 runbot.sh ,如果在window下执行的话,则使用 cygwin 来模拟使用
# bin/runbot.sh
# runbot script to run the Nutch bot for crawling and re-crawling.
# Usage: bin/runbot [safe]
# If executed in 'safe' mode, it doesn't delete the temporary
# directories generated during crawl. This might be helpful for
# analysis and recovery in case a crawl fails.
#
# Author: Susam Pal
depth=4
threads=8
adddays=5
topN=1500000 #Comment this statement if you don't want to set topN value
# Arguments for rm and mv
RMARGS="-rf"
MVARGS="--verbose"
# Parse arguments
if [ "$1" == "safe" ]
then
safe=yes
fi
if [-z "$NUTCH_HOME" ]
then
NUTCH_HOME=.
echo runbot: $0 could not find environment variable NUTCH_HOME
echo runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script
else
echo runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME
fi
NUTCH_HOME=/cygdrive/f/nutch
echo print nutch: $NUTCH_HOME
if [ -z "$CATALINA_HOME" ]
then
CATALINA_HOME=/cygdrive/e/tomcat
echo runbot: $0 could not find environment variable NUTCH_HOME
echo runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script
else
echo runbot: $0 found environment variable CATALINA_HOME=$CATALINA_HOME
fi
if [ -n "$topN" ]
then
topN="-topN $topN"
else
topN=""
fi
steps=8
echo "----- Inject (Step 1 of $steps) -----"
$NUTCH_HOME/bin/nutch inject crawl/crawldb urls
echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
for((i=0; i < $depth; i++))
do
echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
$NUTCH_HOME/bin/nutch generate crawl/crawldb crawl/segments $topN \
-adddays $adddays
if [ $? -ne 0 ]
then
echo "runbot: Stopping at depth $depth. No more URLs to fetch."
break
fi
segment=`ls -d crawl/segments/* | tail -1`
$NUTCH_HOME/bin/nutch fetch $segment -threads $threads
if [ $? -ne 0 ]
then
echo "runbot: fetch $segment at depth `expr $i + 1` failed."
echo "runbot: Deleting segment $segment."
rm $RMARGS $segment
continue
fi
$NUTCH_HOME/bin/nutch updatedb crawl/crawldb $segment
done
echo "----- Merge Segments (Step 3 of $steps) -----"
$NUTCH_HOME/bin/nutch mergesegs crawl/MERGEDsegments crawl/segments/*
if [ "$safe" != "yes" ]
then
rm $RMARGS crawl/segments
else
rm $RMARGS crawl/BACKUPsegments
mv $MVARGS crawl/segments crawl/BACKUPsegments
fi
mv $MVARGS crawl/MERGEDsegments crawl/segments
echo "----- Invert Links (Step 4 of $steps) -----"
$NUTCH_HOME/bin/nutch invertlinks crawl/linkdb crawl/segments/*
echo "----- Index (Step 5 of $steps) -----"
$NUTCH_HOME/bin/nutch index crawl/NEWindexes crawl/crawldb crawl/linkdb \
crawl/segments/*
echo "----- Dedup (Step 6 of $steps) -----"
$NUTCH_HOME/bin/nutch dedup crawl/NEWindexes
echo "----- Merge Indexes (Step 7 of $steps) -----"
$NUTCH_HOME/bin/nutch merge crawl/NEWindex crawl/NEWindexes
echo "----- Loading New Index (Step 8 of $steps) -----"
${CATALINA_HOME}/bin/shutdown.sh
if [ "$safe" != "yes" ]
then
rm $RMARGS crawl/NEWindexes
rm $RMARGS crawl/index
else
rm $RMARGS crawl/BACKUPindexes
rm $RMARGS crawl/BACKUPindex
mv $MVARGS crawl/NEWindexes crawl/BACKUPindexes
mv $MVARGS crawl/index crawl/BACKUPindex
fi
mv $MVARGS crawl/NEWindex crawl/index
${CATALINA_HOME}/bin/startup.sh
echo "runbot: FINISHED: Crawl completed!"
echo ""
//断电停止工作后,再执行这个。
// 刚开始我也不怎么喜欢shell这种方式的代码,不过幸好邻居是搞shell编程的,经它一分析,原来shell也是这么简单。
//来源:http://wiki.apache.org/nutch/Crawl
分享到:
相关推荐
Nutch 1.2 学习笔记,讲的比较清楚的文档
apache-nutch-2.3.1-src.tar.gz
配置好的nutch1.2 java工程,由于上传文件大小限制,只缺少plugins没有传.感觉配置过程太麻烦了,所以感觉有必要分享一下。
1.1什么是nutch..1 1.2研究nutch的原因...1 1.3 nutch的目标..1 1.4 nutch VS lucene.....2 2. nutch的安装与配置.....3 2.1 JDK的安装与配置.3 2.2 nutch的安装与配置........5 2.3 tomcat的安装与配置......5 3. ...
1.1什么是nutch..1 1.2研究nutch的原因...1 1.3 nutch的目标..1 1.4 nutch VS lucene.....2 2. nutch的安装与配置.....3 2.1 JDK的安装与配置.3 2.2 nutch的安装与配置........5 2.3 tomcat的安装与配置........
Nutch开源搜索引擎增量索引recrawl的终极解决办法续
Nutch是一款刚刚诞生的完整的开源搜索引擎系统,可以结合数据库进行索引,能快速构建所需系统。Nutch 是基于Lucene的,Lucene为 Nutch 提供了文本索引和搜索的API,所以它使用Lucene作为索引和检索的模块。Nutch的...
Nutch 是一个开源Java 实现的搜索引擎。这里是它的安装包。
Nutch开源搜索引擎增量索引recrawl的终极解决办法
配置好的nutch1 2 java工程 由于上传文件大小限制 只缺少plugins 自己可以添加
nutch1.2源码,可与hadoop分布式布置,欢迎下载
nutch1.2测试文档
nutch官方简单案例,请版本是nutch-1.2.war
将nutch1.2源码嵌入到myeclipse8.5 在window环境的nutch1.2的简单应用 将其放于tomcat的webapp下
nutch2.1导入Eclipse过程中,import org.restlet.×;错误,缺少的包。
Windows下cygwin+MyEclipse 8.5+Nutch1.2+Tomcat 6.0 Windows下cygwin+MyEclipse 8.5+Nutch1.2+Tomcat 6.0 Windows下cygwin+MyEclipse 8.5+Nutch1.2+Tomcat 6.0
nutch一款开源搜索引擎,recrawl是实现索引更新的脚本 mergecrawl是合并多个网站查询的bash脚本。
nutch Nutch是一个由Java实现的,刚刚诞生开放源代码(open-source)的web搜索引擎。 尽管Web搜索是漫游Internet的基本要求, 但是现有web搜索引擎的数目却在下降。 并且这很有可能进一步演变成为一个公司垄断了几乎...
apache-nutch-1.3 的源码包,需要的可以看下
nutch1.8最新版2014.6.10part2