Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NUTCH-2856] Implement a protocol-smb plugin based on hierynomus/smbj #826

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ lib/spotbugs-*
ivy/dependency-check-ant/*
.gradle*
ivy/apache-rat-*
.vscode
crawl
lewismc marked this conversation as resolved.
Show resolved Hide resolved
urls
solr_datadir
3 changes: 2 additions & 1 deletion conf/log4j2.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
<Appenders>
<RollingFile name="RollingFile" fileName="${hadoop.log.dir}/${hadoop.log.file}"
filePattern="${hadoop.log.dir}/$${date:yyyy-MM}/nutch-%d{yyyy-MM-dd}.log.gz">
<PatternLayout pattern="%d %p %c{1.} [%t] %m%n" />
<!--<PatternLayout pattern="%d %p %c{1.} [%t] %m%n" />-->
<PatternLayout pattern="%d %p %c [%t] %m%n" />
lewismc marked this conversation as resolved.
Show resolved Hide resolved
<CronTriggeringPolicy schedule="0 0 0 * * ?" evaluateOnStartup="true" />
<DefaultRolloverStrategy>
<Delete basePath="${hadoop.log.dir}" maxDepth="2">
Expand Down
32 changes: 32 additions & 0 deletions runNutch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Development helper: wipes the local crawl directory and then runs a Nutch crawl loop.
lewismc marked this conversation as resolved.
Show resolved Hide resolved
# Start from scratch: give the user a few seconds to abort, then drop all
# previous crawl data and inject the seed URLs from the "urls" directory.
echo "Will remove existing CrawlDb..."
sleep 5
echo "Removing existing CrawlDb..."
rm -rf crawl/*

./runtime/local/bin/nutch inject crawl/crawldb urls

# Crawl forever: generate -> fetch -> parse -> updatedb -> index, one segment per round.
while true
do
  ./runtime/local/bin/nutch generate crawl/crawldb crawl/segments/
  # Save the exit status immediately; the echo and sleep below would overwrite $?.
  generate_status=$?
  # The last entry listed in crawl/segments/ is taken as the segment to process.
  segment=$(ls crawl/segments/ | tail -1)
  echo "Found segment $segment"
  sleep 5
  if [ "$generate_status" -eq 0 ] && [ -n "$segment" ]
  then
    # Test the fetch command directly so that no other command can clobber its status.
    if ./runtime/local/bin/nutch fetch "crawl/segments/$segment"
    then
      sleep 5
      ./runtime/local/bin/nutch parse "crawl/segments/$segment"
      sleep 5
      ./runtime/local/bin/nutch updatedb crawl/crawldb "crawl/segments/$segment"
      sleep 5
      ./runtime/local/bin/nutch index crawl/crawldb "crawl/segments/$segment"
      sleep 10
      # The segment has been processed; remove it so the next round starts clean.
      rm -rf "crawl/segments/$segment"
    fi
  else
    # Generate failed or produced no segment: back off before trying again.
    sleep 30
  fi
done
1 change: 1 addition & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
<ant dir="protocol-httpclient" target="deploy"/>
<ant dir="protocol-interactiveselenium" target="deploy" />
<ant dir="protocol-okhttp" target="deploy"/>
<ant dir="protocol-smb" target="deploy"/>
lewismc marked this conversation as resolved.
Show resolved Hide resolved
<ant dir="protocol-selenium" target="deploy" />
<ant dir="publish-rabbitmq" target="deploy"/>
<ant dir="scoring-depth" target="deploy"/>
Expand Down
22 changes: 22 additions & 0 deletions src/plugin/protocol-smb/build.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Ant build for the protocol-smb plugin; it only imports the shared plugin build file. -->
<project default="jar-core"
         name="protocol-smb">
  <import file="../build-plugin.xml"/>
</project>
47 changes: 47 additions & 0 deletions src/plugin/protocol-smb/ivy.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!-- Ivy descriptor of the protocol-smb plugin: module metadata, shared configurations,
     the published artifact and the third-party libraries to resolve. -->
<ivy-module xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="http://ant.apache.org/ivy/schemas/ivy.xsd"
xmlns:ns0="http://ant.apache.org/ivy/maven" version="2.0">
<!-- Module identity; the module name is taken from the Ant project name. -->
<info organisation="org.apache.nutch" module="${ant.project.name}">
<license name="Apache 2.0"/>
<ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
<description>
Apache Nutch
</description>
</info>

<!-- Configurations are not declared here but included from the ivy-configurations.xml
     three directories up. -->
<configurations>
<include file="../../..//ivy/ivy-configurations.xml"/>
</configurations>

<publications>
<!-- Publish one artifact, named after this module, in the "master" configuration. -->
<artifact conf="master"/>
</publications>

<!-- The revisions below also appear in the jar file names listed in this plugin's
     plugin.xml runtime section; keep both files in sync when changing a version. -->
<dependencies>
<!-- SMB client library this plugin is built on. -->
<dependency org="com.hierynomus" name="smbj" rev="0.13.0"/>
<!-- NOTE(review): mbassador, bcprov and asn-one are presumably libraries smbj itself
     needs at runtime; confirm against the smbj POM. -->
<dependency org="net.engio" name="mbassador" rev="1.3.0"/>
lewismc marked this conversation as resolved.
Show resolved Hide resolved
<dependency org="org.bouncycastle" name="bcprov-jdk18on" rev="1.75"/>
<dependency org="com.hierynomus" name="asn-one" rev="0.6.0"/>
<dependency org="commons-io" name="commons-io" rev="2.17.0"/>
HiranChaudhuri marked this conversation as resolved.
Show resolved Hide resolved
</dependencies>

</ivy-module>
53 changes: 53 additions & 0 deletions src/plugin/protocol-smb/plugin.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!-- Plugin descriptor for protocol-smb: lists the jars the plugin ships with and registers
     the Smb class at the Protocol extension point. -->
<plugin
id="protocol-smb"
name="SMB Protocol based on https://github.com/hierynomus/smbj"
version="1.0.0"
provider-name="Hiran Chaudhuri">

<!-- Libraries belonging to this plugin. The file names contain the revisions declared in
     the plugin's ivy.xml, so both files have to be changed together. -->
<runtime>
<library name="asn-one-0.6.0.jar"/>
lewismc marked this conversation as resolved.
Show resolved Hide resolved
<library name="bcprov-jdk18on-1.75.jar"/>
<library name="mbassador-1.3.0.jar"/>
<!-- The plugin's own jar; it is the only library with an export element (all names). -->
<library name="protocol-smb.jar">
<export name="*"/>
</library>
<library name="smbj-0.13.0.jar"/>

<library name="commons-io-2.17.0.jar"/>
</runtime>

<!-- The plugin imports the nutch-extensionpoints plugin. -->
<requires>
<import plugin="nutch-extensionpoints"/>
</requires>

<!-- Registers org.apache.nutch.protocol.smb.Smb as an implementation of the
     org.apache.nutch.protocol.Protocol extension point. -->
<extension id="org.apache.nutch.protocol.smb"
name="SmbProtocol"
point="org.apache.nutch.protocol.Protocol">

<implementation id="org.apache.nutch.protocol.smb.Smb"
class="org.apache.nutch.protocol.smb.Smb">
<!-- URL scheme name and the URLStreamHandler class associated with it. -->
<parameter name="protocolName" value="smb"/>
<parameter name="urlStreamHandler" value="org.apache.nutch.protocol.smb.Handler"/>
lewismc marked this conversation as resolved.
Show resolved Hide resolved
</implementation>

</extension>

</plugin>
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.smb;

import java.net.URL;
import java.net.URLConnection;
import java.net.URLStreamHandler;

/**
 * {@link URLStreamHandler} of the protocol-smb plugin. Every URL handed to this
 * handler is wrapped in a new {@code SmbURLConnection}.
 */
public class Handler extends URLStreamHandler {
lewismc marked this conversation as resolved.
Show resolved Hide resolved

/**
 * Creates the connection object for the given URL.
 *
 * @param u the URL to create a connection for
 * @return a new {@code SmbURLConnection} constructed from {@code u}
 */
@Override
protected URLConnection openConnection(URL u) {
return new SmbURLConnection(u);
}
}
Loading