I would like to be able to construct a raw HTTP request and send it with a socket. Obviously, you would suggest I use something like urllib or urllib2, but I do not want to use those.
It would have to look something like this:
import socket

tcpsoc = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
tcpsoc.connect(('72.14.192.58', 80))  # connect to Google's IP (bind is for local addresses)
tcpsoc.send(b'HTTP REQUEST')
response = tcpsoc.recv(4096)
Obviously you would also have to request a specific page/file and pass GET and POST parameters, as sketched below.
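As far as I understand, GET parameters go in the query string of the request line and POST parameters in the request body, something like this (a rough sketch; the host, paths and values are made up):

# GET: parameters go in the query string of the request line
get_request = (
    "GET /search?q=foo&lang=en HTTP/1.1\r\n"
    "Host: example.com\r\n"
    "Connection: close\r\n"
    "\r\n"
)

# POST: parameters go in the body, announced by Content-Type and Content-Length
post_body = "q=foo&lang=en"
post_request = (
    "POST /search HTTP/1.1\r\n"
    "Host: example.com\r\n"
    "Content-Type: application/x-www-form-urlencoded\r\n"
    "Content-Length: {0}\r\n"
    "Connection: close\r\n"
    "\r\n"
    "{1}"
).format(len(post_body), post_body)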
Most of what you need to know is in the HTTP/1.1 spec, which you should definitely study if you want to roll your own HTTP implementation: http://www.w3.org/Protocols/rfc2616/rfc2616.html
import socket
from urllib.parse import urlparse

CONNECTION_TIMEOUT = 5
CHUNK_SIZE = 1024
HTTP_VERSION = 1.0
CRLF = "\r\n\r\n"  # two CRLFs: one ends the request line, the blank line ends the headers

socket.setdefaulttimeout(CONNECTION_TIMEOUT)

def receive_all(sock, chunk_size=CHUNK_SIZE):
    '''
    Read from the socket until the server closes the connection.
    '''
    chunks = []
    while True:
        chunk = sock.recv(chunk_size)
        if chunk:
            chunks.append(chunk)
        else:
            break
    return b''.join(chunks)

def get(url, **kw):
    kw.setdefault('timeout', CONNECTION_TIMEOUT)
    kw.setdefault('chunk_size', CHUNK_SIZE)
    kw.setdefault('http_version', HTTP_VERSION)
    kw.setdefault('headers_only', False)
    kw.setdefault('response_code_only', False)
    kw.setdefault('body_only', False)
    url = urlparse(url)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(kw.get('timeout'))
    # connect to url.hostname, not url.netloc: netloc may still contain ':port'
    sock.connect((url.hostname, url.port or 80))
    msg = 'GET {0} HTTP/{1}{2}'
    sock.sendall(msg.format(url.path or '/', kw.get('http_version'), CRLF).encode())
    data = receive_all(sock, chunk_size=kw.get('chunk_size'))
    sock.shutdown(socket.SHUT_RDWR)
    sock.close()
    data = data.decode(errors='ignore')
    # headers end at the first blank line; the first line is the status line
    headers = data.split(CRLF, 1)[0]
    status_line = headers.split('\n')[0]
    response_code = status_line.split()[1]
    headers = headers.replace(status_line, '')
    body = data.replace(headers, '').replace(status_line, '')
    if kw['body_only']:
        return body
    if kw['headers_only']:
        return headers
    if kw['response_code_only']:
        return response_code
    return data
print(get('http://www.google.com/'))
Yes, basically you just have to write text, something like:
GET /pageyouwant.html HTTP/1.1[CRLF]
Host: google.com[CRLF]
Connection: close[CRLF]
User-Agent: MyAwesomeUserAgent/1.0.0[CRLF]
Accept-Encoding: gzip[CRLF]
Accept-Charset: ISO-8859-1,UTF-8;q=0.7,*;q=0.7[CRLF]
Cache-Control: no-cache[CRLF]
[CRLF]
Feel free to remove / add headers at will.
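For instance, here is a minimal sketch that sends roughly that request over a plain socket and reads until the server closes the connection (I left out Accept-Encoding: gzip so the body comes back uncompressed):

import socket

request = (
    "GET / HTTP/1.1\r\n"
    "Host: google.com\r\n"
    "Connection: close\r\n"  # lets us read until EOF instead of parsing Content-Length
    "User-Agent: MyAwesomeUserAgent/1.0.0\r\n"
    "Accept-Charset: ISO-8859-1,UTF-8;q=0.7,*;q=0.7\r\n"
    "Cache-Control: no-cache\r\n"
    "\r\n"  # blank line ends the header block
)

sock = socket.create_connection(('google.com', 80))
sock.sendall(request.encode('ascii'))

chunks = []
while True:
    chunk = sock.recv(4096)
    if not chunk:
        break
    chunks.append(chunk)
sock.close()

print(b''.join(chunks).decode('iso-8859-1'))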
For a working example to guide you, you might want to take a look at libcurl, a library written in the C language that:
does what you want and much more;
is a snap to use;
is widely deployed; and
is actively supported.
It's a beautiful thing and one of the best examples of what open source can and should be.
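If you'd rather call libcurl from Python than read its C source, the pycurl bindings expose it; a minimal sketch, assuming pycurl is installed:

import pycurl
from io import BytesIO

buffer = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'http://www.google.com/')
curl.setopt(pycurl.WRITEDATA, buffer)  # collect the response body
curl.perform()
curl.close()

print(buffer.getvalue().decode('iso-8859-1'))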