Pipe a large amount of data to stdin while using subprocess

Posted 2020-02-08 06:19

I'm struggling to understand the Pythonic way of solving this simple problem.

My problem is quite simple: if you use the following code, it will hang. This is well documented in the subprocess module documentation.

import subprocess

proc = subprocess.Popen(['cat', '-'],
                        stdin=subprocess.PIPE,
                        stdout=subprocess.PIPE,
                        )
for i in range(100000):
    proc.stdin.write('%d\n' % i)  # hangs here once the OS pipe buffer fills up
output = proc.communicate()[0]
print output

Searching for a solution (there was a very insightful thread, but I've lost it now), I found this solution (among others) that uses an explicit fork:

import os
import sys
from subprocess import Popen, PIPE

def produce(to_sed):
    for i in range(100000):
        to_sed.write("%d\n" % i)
        to_sed.flush()
    # this would happen implicitly anyway, but is here for the example
    to_sed.close()

def consume(from_sed):
    while 1:
        res = from_sed.readline()
        if not res:
            sys.exit(0)
            #sys.exit(proc.poll())
        print 'received: ', [res]

def main():
    proc = Popen(['cat','-'],stdin=PIPE,stdout=PIPE)
    to_sed = proc.stdin
    from_sed = proc.stdout

    pid = os.fork()
    if pid == 0:
        # child: close the read end and feed the input
        from_sed.close()
        produce(to_sed)
        return
    else:
        # parent: close the write end and consume the output
        to_sed.close()
        consume(from_sed)

if __name__ == '__main__':
    main()

While this solution is conceptually very easy to understand, it uses one more process and struck me as too low-level compared to the subprocess module (which exists precisely to hide this kind of thing...).

I'm wondering: is there a simple and clean solution using the subprocess module that won't hang, or do I have to take a step back and implement an old-style select loop or an explicit fork to get this pattern?

Thanks

10 Answers
The star
#2 · 2020-02-08 06:21

Your code deadlocks as soon as cat's stdout OS pipe buffer is full. If you use stdout=PIPE, you have to consume the output in time; otherwise a deadlock like the one in your case may happen.

If you don't need the output while the process is running, you could redirect it to a temporary file:

#!/usr/bin/env python3
import subprocess
import tempfile

with tempfile.TemporaryFile('r+') as output_file:
    with subprocess.Popen(['cat'],
                          stdin=subprocess.PIPE,
                          stdout=output_file,
                          universal_newlines=True) as process:
        for i in range(100000):
            print(i, file=process.stdin)
    output_file.seek(0)  # rewind (and sync with the disk)
    print(output_file.readline(), end='')  # get the first line of the output

If the input/output are small (fit in memory), you could pass the input all at once and get the output all at once using .communicate(), which reads/writes concurrently for you (the subprocess.run() call below uses it internally):

#!/usr/bin/env python3
import subprocess

cp = subprocess.run(['cat'], input='\n'.join(['%d' % i for i in range(100000)]),
                    stdout=subprocess.PIPE, universal_newlines=True)
print(cp.stdout.splitlines()[-1]) # print the last line
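
The same can be written against Popen.communicate() directly; a minimal sketch of the same idea, for Pythons that lack subprocess.run() (added in 3.5):

#!/usr/bin/env python
from subprocess import Popen, PIPE

p = Popen(['cat'], stdin=PIPE, stdout=PIPE, universal_newlines=True)
# communicate() feeds all the input and drains all the output
# concurrently, so the pipe-buffer deadlock cannot occur
out, _ = p.communicate('\n'.join('%d' % i for i in range(100000)))
print(out.splitlines()[-1])  # print the last line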

To read/write concurrently manually, you could use threads, asyncio, fcntl, etc. @Jed provided a simple thread-based solution. Here's an asyncio-based solution:

#!/usr/bin/env python3
import asyncio
import sys
from subprocess import PIPE

async def pump_input(writer):
    try:
        for i in range(100000):
            writer.write(b'%d\n' % i)
            await writer.drain()
    finally:
        writer.close()

async def run():
    # start child process
    # NOTE: universal_newlines parameter is not supported
    process = await asyncio.create_subprocess_exec('cat', stdin=PIPE, stdout=PIPE)
    asyncio.ensure_future(pump_input(process.stdin)) # write input
    async for line in process.stdout: # consume output
        print(int(line)**2) # print squares
    return await process.wait()  # wait for the child process to exit


if sys.platform.startswith('win'):
    loop = asyncio.ProactorEventLoop() # for subprocess' pipes on Windows
    asyncio.set_event_loop(loop)
else:
    loop = asyncio.get_event_loop()
loop.run_until_complete(run())
loop.close()

On Unix, you could use an fcntl-based solution:

#!/usr/bin/env python3
import sys
from fcntl import fcntl, F_GETFL, F_SETFL
from os import O_NONBLOCK
from shutil import copyfileobj
from subprocess import Popen, PIPE, _PIPE_BUF as PIPE_BUF

def make_blocking(pipe, blocking=True):
    fd = pipe.fileno()
    if not blocking:
        fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK) # set O_NONBLOCK
    else:
        fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) & ~O_NONBLOCK) # clear it


with Popen(['cat'], stdin=PIPE, stdout=PIPE) as process:
    make_blocking(process.stdout, blocking=False)
    with process.stdin:
        for i in range(100000):
            # NOTE: the pipe is block-buffered (the default), so `cat`
            # won't see the data immediately
            process.stdin.write(b'%d\n' % i)
            # a deadlock could happen here with a *blocking* pipe
            output = process.stdout.read(PIPE_BUF)
            if output is not None:
                sys.stdout.buffer.write(output)
    # read the rest
    make_blocking(process.stdout)
    copyfileobj(process.stdout, sys.stdout.buffer)
何必那么认真
#3 · 2020-02-08 06:23

Here's something I used to load a 6 GB MySQL dump file via subprocess. Stay away from shell=True: it's not secure, and it starts an extra process, wasting resources.

import subprocess

cmd = [mysql_path,
       "-u", mysql_user, "-p" + mysql_pass,
       "-h", host, database]

fhandle = open(dump_file, 'r')
p = subprocess.Popen(cmd, stdin=fhandle, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

(stdout,stderr) = p.communicate()

fhandle.close()
甜甜的少女心
#4 · 2020-02-08 06:29

Here is an example (Python 3) of reading one record at a time from gzip using a pipe:

from subprocess import Popen, PIPE

cmd = ['gzip', '-dc', 'compressed_file.gz']
pipe = Popen(cmd, stdout=PIPE).stdout

for line in pipe:
    print(":", line.decode(), end="")

I know there is a standard module for that; this is just meant as an example. You can read the whole output in one go (like shell backticks) using the communicate method, but obviously you have to be careful about memory size.
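
For comparison, the standard-library way might look like this minimal sketch (assuming the same compressed_file.gz):

import gzip

# the gzip module decompresses in-process; no child process or pipe needed
with gzip.open('compressed_file.gz', 'rt') as f:
    for line in f:
        print(":", line, end="")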

Here is an example (Python 3 again) of writing records to the lp(1) program on Linux:

cmd = ['lp', '-']
proc = Popen(cmd, stdin=PIPE)
proc.communicate(some_data.encode())
放荡不羁爱自由
#5 · 2020-02-08 06:30

If you don't want to keep all the data in memory, you can use select to interleave the reads and writes. E.g. something like:

import subprocess
from select import select
import os

proc = subprocess.Popen(['cat'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)

i = 0
while True:
    rlist, wlist, xlist = [proc.stdout], [], []
    if i < 100000:
        wlist.append(proc.stdin)
    rlist, wlist, xlist = select(rlist, wlist, xlist)
    if proc.stdout in rlist:
        out = os.read(proc.stdout.fileno(), 10)
        print out,
        if not out:
            break
    if proc.stdin in wlist:
        proc.stdin.write('%d\n' % i)
        i += 1
        if i >= 100000:
            proc.stdin.close()
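
The same pattern with the higher-level selectors module (Python 3) might look like this sketch; it is an assumed rewrite of the loop above, not the answer's original code:

import os
import selectors
import subprocess
import sys

proc = subprocess.Popen(['cat'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
sel = selectors.DefaultSelector()
sel.register(proc.stdout, selectors.EVENT_READ)
sel.register(proc.stdin, selectors.EVENT_WRITE)

i = 0
eof = False
while not eof:
    for key, _ in sel.select():
        if key.fileobj is proc.stdout:
            chunk = os.read(proc.stdout.fileno(), 4096)
            if not chunk:  # EOF: cat closed its end
                eof = True
                break
            sys.stdout.buffer.write(chunk)
        else:
            # os.write bypasses Python-level buffering, so a writable
            # pipe really can take the data without blocking
            os.write(proc.stdin.fileno(), b'%d\n' % i)
            i += 1
            if i >= 100000:
                sel.unregister(proc.stdin)
                proc.stdin.close()
proc.wait()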
啃猪蹄的小仙女
#6 · 2020-02-08 06:30

For this kind of thing, the shell works a lot better than subprocess.

Write very simple Python apps which read from sys.stdin and write to sys.stdout.

Connect the simple apps together using a shell pipeline.

If you want, start the pipeline using subprocess or simply write a one-line shell script.

python part1.py | python part2.py

This is very, very efficient. It's also portable to Linux (and Windows) as long as you keep it very simple. Each stage is just a stdin-to-stdout filter, as in the sketch below.
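
A minimal sketch of what one hypothetical stage (part1.py here) could look like; part2.py would have the same shape:

#!/usr/bin/env python
# read numbers from stdin, write their squares to stdout
import sys

for line in sys.stdin:
    line = line.strip()
    if line:
        print(int(line) ** 2)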

男人必须洒脱
#7 · 2020-02-08 06:32

The simplest solution I can think of:

from subprocess import Popen, PIPE
from threading import Thread

s = map(str, xrange(10000))  # a large list of strings (Python 2)
p = Popen(['cat'], stdin=PIPE, stdout=PIPE, bufsize=1)
Thread(target=lambda: any((p.stdin.write(b) for b in s)) or p.stdin.close()).start()
print (p.stdout.read())

Buffered version:

from subprocess import Popen, PIPE
from threading import Thread

s = ''.join(map(str, xrange(10000)))  # a large string (Python 2)
n = 1024  # buffer size
p = Popen(['cat'], stdin=PIPE, stdout=PIPE, bufsize=n)
Thread(target=lambda: any((p.stdin.write(c) for c in (s[i:i+n] for i in xrange(0, len(s), n)))) or p.stdin.close()).start()
print (p.stdout.read())
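
The lambda trick is dense; here is an equivalent, more explicit spelling of the same idea as a Python 3 sketch:

from subprocess import Popen, PIPE
from threading import Thread

def pump(stdin):
    # feed the child from a helper thread so the main thread is free
    # to drain stdout -- this is what avoids the pipe-buffer deadlock
    try:
        for i in range(10000):
            stdin.write(b'%d\n' % i)
    finally:
        stdin.close()  # EOF lets `cat` finish

p = Popen(['cat'], stdin=PIPE, stdout=PIPE)
Thread(target=pump, args=(p.stdin,)).start()
print(p.stdout.read().decode())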