Professional Documents
Culture Documents
1) JDK Installation
Reference : http://java.sun.com/j2se/1.5.0/install-linux.html
cd /usr/local
./jdk-1_5_0_07-linux-i586.bin
export PATH=/usr/local/jdk1.5.0_07/bin/:$PATH
export JAVA_HOME=/usr/local/jdk1.5.0_07
export CLASSPATH=.
2) Tomcat Setup
cd /tmp
wget http://apache.forbigweb.com/tomcat/tomcat-5/v5.5.17/bin/apache-tomcat-5.5.17.tar.gz
tar xzf apache-tomcat-5.5.17.tar.gz
mv apache-tomcat-5.5.17 /usr/share/tomcat5
3) Nutch Setup
Reference:http://lucene.apache.org/nutch/tutorial8.html
http://wiki.apache.org/nutch/FAQ#head-0c5dd359a76f9ac5ed54f9d81d79130e4c9c3302
cd /tmp
wget http://mirrors.isc.org/pub/apache/lucene/nutch/nutch-0.8.tar.gz
tar xzf nutch-0.8.tar.gz
mv nutch-0.8 /usr/local/nutch
cd /usr/local/nutch
mkdir urls
vi urls/ing
#Seed file: list the start URLs to crawl, one per line
cd conf
cp -a crawl-urlfilter.txt crawl-urlfilter.txt.orig
vi crawl-urlfilter.txt
#Replace MY.DOMAIN.NAME in the accept pattern with the domain name of the site you want to crawl
cp -a nutch-site.xml nutch-site.xml.orig
vi nutch-site.xml
#*******************
<property>
<name>http.agent.name</name>
<value>MES</value>
<description>HTTP 'User-Agent' request header. MUST NOT be empty -
please set this to a single word uniquely related to your organization.
http.robots.agents
http.agent.description
http.agent.url
http.agent.email
http.agent.version
</description>
</property>
<property>
<name>http.agent.description</name>
<value>MES BOT</value>
<description>Further description of our bot- this text is used in
the User-Agent header. It appears in parenthesis after the agent name.
</description>
</property>
<property>
<name>http.agent.url</name>
<value>http://megaesecure.com</value>
<description>A URL to advertise in the User-Agent header. This will
appear in parenthesis after the agent name. Custom dictates that this
should be a URL of a page explaining the purpose and behavior of this
crawler.
</description>
</property>
<property>
<name>http.agent.email</name>
<value>sharjeel at mega dot com</value>
<description>An email address to advertise in the HTTP 'From' request
header and User-Agent header. A good practice is to mangle this
address (e.g. 'info at example dot com') to avoid spamming.
</description>
</property>
#****************
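cd /usr/local/nutch
#Run the crawl so that /usr/local/nutch/crawl exists (command as given in the Nutch 0.8 tutorial; adjust -depth/-topN as needed)
bin/nutch crawl urls -dir crawl -depth 3 -topN 50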
cd /usr/share/tomcat5/webapps
rm -rf ROOT*
cp /usr/local/nutch/nutch*.war /usr/share/tomcat5/webapps/ROOT.war
cd /usr/local/nutch/crawl
/usr/share/tomcat5/bin/catalina.sh start