I have an HDFS cluster that I want to read from and write to using a Python script:
import requests
import json
import os
import kerberos
import sys

node = os.getenv("namenode").split(",")
print(node)

local_file_path = sys.argv[1]
remote_file_path = sys.argv[2]
read_or_write = sys.argv[3]
print(local_file_path, remote_file_path)

def check_node_status(node):
    for name in node:
        print(name)
        request = requests.get("%s/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus" % name,
                               verify=False).json()
        status = request["beans"][0]["State"]
        if status == "active":
            nnhost = request["beans"][0]["HostAndPort"]
            splitaddr = nnhost.split(":")
            nnaddress = splitaddr[0]
            print(nnaddress)
            break
    return status, name, nnaddress

def kerberos_auth(nnaddress):
    __, krb_context = kerberos.authGSSClientInit("HTTP@%s" % nnaddress)
    kerberos.authGSSClientStep(krb_context, "")
    negotiate_details = kerberos.authGSSClientResponse(krb_context)
    headers = {"Authorization": "Negotiate " + negotiate_details,
               "Content-Type": "application/binary"}
    return headers

def kerberos_hdfs_upload(status, name, headers):
    print("running upload function")
    if status == "active":
        print("if function")
        data = open('%s' % local_file_path, 'rb').read()
        write_req = requests.put("%s/webhdfs/v1%s?op=CREATE&overwrite=true" % (name, remote_file_path),
                                 headers=headers,
                                 verify=False,
                                 allow_redirects=True,
                                 data=data)
        print(write_req.text)

def kerberos_hdfs_read(status, name, headers):
    if status == "active":
        read = requests.get("%s/webhdfs/v1%s?op=OPEN" % (name, remote_file_path),
                            headers=headers,
                            verify=False,
                            allow_redirects=True)
        if read.status_code == 200:
            data = open('%s' % local_file_path, 'wb')
            data.write(read.content)
            data.close()
        else:
            print(read.content)

status, name, nnaddress = check_node_status(node)
headers = kerberos_auth(nnaddress)
if read_or_write == "write":
    kerberos_hdfs_upload(status, name, headers)
elif read_or_write == "read":
    print("fun")
    kerberos_hdfs_read(status, name, headers)
The code works on my own machine, which is not behind any proxy. But when I run it on the office machine, which is behind a proxy, it gives the following proxy error:
$ python3 python_hdfs.py ./1.png /user/testuser/2018-02-07_1.png write
['https://<servername>:50470', 'https:// <servername>:50470']
./1.png /user/testuser/2018-02-07_1.png
https://<servername>:50470
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 555, in urlopen
    self._prepare_proxy(conn)
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 753, in _prepare_proxy
    conn.connect()
  File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 230, in connect
    self._tunnel()
  File "/usr/lib/python3.5/http/client.py", line 832, in _tunnel
    message.strip()))
OSError: Tunnel connection failed: 504 Unknown Host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/requests/adapters.py", line 376, in send
    timeout=timeout
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 610, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "/usr/lib/python3/dist-packages/urllib3/util/retry.py", line 273, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
requests.packages.urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='<servername>', port=50470): Max retries exceeded with url: /jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 504 Unknown Host',)))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "python_hdfs.py", line 68, in <module>
    status, name, nnaddress= check_node_status(node)
  File "python_hdfs.py", line 23, in check_node_status
    verify=False).json()
  File "/usr/lib/python3/dist-packages/requests/api.py", line 67, in get
    return request('get', url, params=params, **kwargs)
  File "/usr/lib/python3/dist-packages/requests/api.py", line 53, in request
    return session.request(method=method, url=url, **kwargs)
  File "/usr/lib/python3/dist-packages/requests/sessions.py", line 468, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/lib/python3/dist-packages/requests/sessions.py", line 576, in send
    r = adapter.send(request, **kwargs)
  File "/usr/lib/python3/dist-packages/requests/adapters.py", line 437, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='<server_name>', port=50470): Max retries exceeded with url: /jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 504 Unknown Host',)))
I tried specifying the proxy info in the code, like so:
proxies = {
    "http": "<proxy_username>:<proxy_password>@<proxy_IP>:<proxy_port>",
    "https": "<proxy_username>:<proxy_password>@<proxy_IP>:<proxy_port>",
}

node = os.getenv("namenode").split(",")
print(node)

local_file_path = sys.argv[1]
remote_file_path = sys.argv[2]
read_or_write = sys.argv[3]
print(local_file_path, remote_file_path)

def check_node_status(node):
    for name in node:
        print(name)
        request = requests.get("%s/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus" % name,
                               proxies=proxies,
                               verify=False).json()
        status = request["beans"][0]["State"]
        if status == "active":
            nnhost = request["beans"][0]["HostAndPort"]
            splitaddr = nnhost.split(":")
            nnaddress = splitaddr[0]
            print(nnaddress)
            break
    return status, name, nnaddress

### Rest of the code is the same
Now it gives the following error:
$ python3 python_hdfs.py ./1.png /user/testuser/2018-02-07_1.png write
['https://<servername>:50470', 'https:// <servername>:50470']
./1.png /user/testuser/2018-02-07_1.png
https://<servername>:50470
Traceback (most recent call last):
  File "python_hdfs.py", line 73, in <module>
    status, name, nnaddress= check_node_status(node)
  File "python_hdfs.py", line 28, in check_node_status
    verify=False).json()
  File "/usr/lib/python3/dist-packages/requests/api.py", line 67, in get
    return request('get', url, params=params, **kwargs)
  File "/usr/lib/python3/dist-packages/requests/api.py", line 53, in request
    return session.request(method=method, url=url, **kwargs)
  File "/usr/lib/python3/dist-packages/requests/sessions.py", line 468, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/lib/python3/dist-packages/requests/sessions.py", line 576, in send
    r = adapter.send(request, **kwargs)
  File "/usr/lib/python3/dist-packages/requests/adapters.py", line 343, in send
    conn = self.get_connection(request.url, proxies)
  File "/usr/lib/python3/dist-packages/requests/adapters.py", line 254, in get_connection
    proxy_manager = self.proxy_manager_for(proxy)
  File "/usr/lib/python3/dist-packages/requests/adapters.py", line 160, in proxy_manager_for
    **proxy_kwargs)
  File "/usr/lib/python3/dist-packages/urllib3/poolmanager.py", line 281, in proxy_from_url
    return ProxyManager(proxy_url=url, **kw)
  File "/usr/lib/python3/dist-packages/urllib3/poolmanager.py", line 232, in __init__
    raise ProxySchemeUnknown(proxy.scheme)
requests.packages.urllib3.exceptions.ProxySchemeUnknown: Not supported proxy scheme <proxy_username>
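If I am reading that last error correctly, requests is treating <proxy_username> as the proxy scheme, so my guess (untested from behind the office proxy) is that the proxy URLs themselves need an explicit scheme, something like:

proxies = {
    # the proxy itself is normally reached over plain http, even for https targets
    "http": "http://<proxy_username>:<proxy_password>@<proxy_IP>:<proxy_port>",
    "https": "http://<proxy_username>:<proxy_password>@<proxy_IP>:<proxy_port>",
}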
So my question is: do I need to set up the proxy in Kerberos for this to work? If so, how? I am not too familiar with Kerberos. I run kinit before running the Python code in order to get a ticket for the Kerberos realm; that runs fine and connects to the appropriate HDFS servers without the proxy. So I don't know why this error occurs when reading from or writing to the same HDFS servers. Any help is appreciated.
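For completeness, this is the sequence I run (the principal is the same one that appears in the Java output further down):

$ kinit testuser@KRB.REALM       # prompts for the password and obtains a TGT
$ klist                          # confirms the ticket cache holds a valid ticket
$ python3 python_hdfs.py ./1.png /user/testuser/2018-02-07_1.png write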
I also have the proxy set up in /etc/apt/apt.conf, like so:
Acquire::http::proxy "http://<proxy_username>:<proxy_password>@<proxy_IP>:<proxy_port>/";
Acquire::https::proxy "https://<proxy_username>:<proxy_password>@<proxy_IP>:<proxy_port>/";
I have also tried the following:
$ export http_proxy="http://<user>:<pass>@<proxy>:<port>"
$ export HTTP_PROXY="http://<user>:<pass>@<proxy>:<port>"
$ export https_proxy="http://<user>:<pass>@<proxy>:<port>"
$ export HTTPS_PROXY="http://<user>:<pass>@<proxy>:<port>"
import os
proxy = 'http://<user>:<pass>@<proxy>:<port>'
os.environ['http_proxy'] = proxy
os.environ['HTTP_PROXY'] = proxy
os.environ['https_proxy'] = proxy
os.environ['HTTPS_PROXY'] = proxy
# rest of the code is the same
But the error persists.
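Another thing I am considering (an untested sketch, with the hostname as a placeholder) is telling requests not to use the proxy for the internal namenode hosts at all, since Kerberos reaches them fine without one. As far as I know, requests honours no_proxy, and a Session with trust_env disabled ignores the proxy environment variables entirely:

import os
import requests

# Option 1: exempt the internal namenode hosts from the proxy
# (requests reads no_proxy / NO_PROXY from the environment)
os.environ['no_proxy'] = '<servername>'

# Option 2: use a Session that ignores proxy environment variables entirely,
# and replace requests.get(...) / requests.put(...) with session.get(...) / session.put(...)
session = requests.Session()
session.trust_env = False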
UPDATE: I have also tried the following.
Somebody suggested that we already have a proxy set up in /etc/apt/apt.conf to connect to the web, but that we may not need a proxy to connect to HDFS, so I should try commenting out the proxies in /etc/apt/apt.conf and running the Python script again. I did that:

$ env | grep proxy
http_proxy=http://hfli:Test6969@192.168.44.217:8080
https_proxy=https://hfli:Test6969@192.168.44.217:8080
$ unset http_proxy
$ unset https_proxy
$ env | grep proxy
$
Then I ran the Python script again, (i) without defining proxies in the Python script, and (ii) with the proxies defined in the Python script. I got the same original proxy error in both cases.
I found the following Java program, which supposedly provides access to run Java programs against the HDFS cluster:
import com.sun.security.auth.callback.TextCallbackHandler;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import javax.security.auth.Subject;
import javax.security.auth.login.LoginContext;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;

public class HDFS_RW_Secure {
    public static void main(String[] args) throws Exception {
        System.setProperty("java.security.auth.login.config", "/tmp/sc3_temp/hadoop_kdc.txt");
        System.setProperty("java.security.krb5.conf", "/tmp/sc3_temp/hadoop_krb.txt");

        Configuration hadoopConf = new Configuration();
        // this example uses password login; you can change it to use keytab login
        LoginContext lc;
        Subject subject;
        lc = new LoginContext("JaasSample", new TextCallbackHandler());
        lc.login();
        System.out.println("login");
        subject = lc.getSubject();
        UserGroupInformation.setConfiguration(hadoopConf);
        UserGroupInformation ugi = UserGroupInformation.getUGIFromSubject(subject);
        UserGroupInformation.setLoginUser(ugi);

        Path pt = new Path("hdfs://edhcluster" + args[0]);
        FileSystem fs = FileSystem.get(hadoopConf);

        // write
        FSDataOutputStream fin = fs.create(pt);
        fin.writeUTF("Hello!");
        fin.close();

        // read back
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt)));
        String line;
        line = br.readLine();
        while (line != null) {
            System.out.println(line);
            line = br.readLine();
        }
        fs.close();
        System.out.println("This is the end.");
    }
}
We need to take its jar file, HDFS.jar, and run the following shell script to enable Java programs to be run against HDFS.
nano run.sh
# contents of the run.sh file:
/tmp/sc3_temp/jre1.8.0_161/bin/java -Djavax.net.ssl.trustStore=/tmp/sc3_temp/cacerts -Djavax.net.ssl.trustStorePassword=changeit -jar /tmp/sc3_temp/HDFS.jar $1
So, I can run this shell script with a path under /user/testuser as the argument to run the Java program against HDFS:
./run.sh /user/testuser/test2
which gives the following output:
Debug is true storeKey false useTicketCache false useKeyTab false doNotPrompt false ticketCache is null isInitiator true KeyTab is null refreshKrb5Config is false principal is null tryFirstPass is false useFirstPass is false storePass is false clearPass is false
Kerberos username [testuser]: testuser
Kerberos password for testuser:
[Krb5LoginModule] user entered username: testuser
principal is testuser@KRB.REALM
Commit Succeeded
login
2018-02-08 14:09:30,020 WARN [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(62)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Hello!
This is the end.
So that's working, I suppose. But how do I write an equivalent shell script to run my Python code?
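What I imagine the equivalent might look like is something along these lines, although the keytab and PEM bundle paths are placeholders and I am not sure how to translate the Java truststore flags (I believe requests reads REQUESTS_CA_BUNDLE) or whether this would get around the proxy issue:

#!/bin/bash
# hypothetical run_py.sh -- same idea as run.sh, but for the Python script
export REQUESTS_CA_BUNDLE=/tmp/sc3_temp/cacerts.pem      # truststore converted from JKS to PEM
kinit -kt /tmp/sc3_temp/testuser.keytab testuser@KRB.REALM
python3 /tmp/sc3_temp/python_hdfs.py "$1" "$2" "$3"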