You are on page 1 of 3

How to install Nutch 0.8

1) JDK Installation

Reference : http://java.sun.com/j2se/1.5.0/install-linux.html

cd /usr/local

Download JDK 5.0 Update 7 (the jdk-1_5_0_07-linux-i586.bin file) from:


http://java.sun.com/javase/downloads/index.jsp

chmod 755 jdk-1_5_0_07-linux-i586.bin

./jdk-1_5_0_07-linux-i586.bin

export PATH=/usr/local/jdk1.5.0_07/bin/:$PATH
export JAVA_HOME=/usr/local/jdk1.5.0_07
export CLASSPATH=.

2) Tomcat Setup

cd /tmp
wget http://apache.forbigweb.com/tomcat/tomcat-5/v5.5.17/bin/apache-tomcat-5.5.17.tar.gz

tar zxvf apache-tomcat-5.5.17.tar.gz

mv apache-tomcat-5.5.17 /usr/share/tomcat5

3) Nutch Setup

Reference: http://lucene.apache.org/nutch/tutorial8.html
http://wiki.apache.org/nutch/FAQ#head-0c5dd359a76f9ac5ed54f9d81d79130e4c9c3302

cd /tmp

wget http://mirrors.isc.org/pub/apache/lucene/nutch/nutch-0.8.tar.gz

tar zxvf nutch-0.8.tar.gz

mv nutch-0.8 /usr/local/nutch

cd /usr/local/nutch
mkdir urls
vi urls/ing

#add the line below


http://ing.clients.megaesecure.com

cd conf

cp -a crawl-urlfilter.txt crawl-urlfilter.txt.orig

vi crawl-urlfilter.txt
#Replace MY.DOMAIN.NAME with your site's domain name

cp -a nutch-site.xml nutch-site.xml.orig
vi nutch-site.xml

Add the following lines between the <configuration> and </configuration> tags

#*******************
<property>
<name>http.agent.name</name>
<value>MES</value>
<description>HTTP 'User-Agent' request header. MUST NOT be empty -
please set this to a single word uniquely related to your organization.

NOTE: You should also check other related properties:

http.robots.agents
http.agent.description
http.agent.url
http.agent.email
http.agent.version

and set their values appropriately.

</description>
</property>

<property>
<name>http.agent.description</name>
<value>MES BOT</value>
<description>Further description of our bot- this text is used in
the User-Agent header. It appears in parenthesis after the agent name.
</description>
</property>

<property>
<name>http.agent.url</name>
<value>http://megaesecure.com</value>
<description>A URL to advertise in the User-Agent header. This will
appear in parenthesis after the agent name. Custom dictates that this
should be a URL of a page explaining the purpose and behavior of this
crawler.
</description>
</property>

<property>
<name>http.agent.email</name>
<value>sharjeel at mega dot com</value>
<description>An email address to advertise in the HTTP 'From' request
header and User-Agent header. A good practice is to mangle this
address (e.g. 'info at example dot com') to avoid spamming.
</description>
</property>

#****************

cd /usr/local/nutch

bin/nutch crawl urls -dir datacache -depth 2 -topN 50

cd /usr/share/tomcat5/webapps
rm -rf ROOT*

cp /usr/local/nutch/nutch*.war /usr/share/tomcat5/webapps/ROOT.war

cd /usr/local/nutch/crawl

/usr/share/tomcat5/bin/catalina.sh start