Python parallel thread that consumes a Watchdog queue

Published 2020-04-19 06:15

Question:

I have this code that should put an event in a queue each time an external program (TCPdump) creates a *.pcap file in my directory. My problem is that the queue is always empty, although I do get the print from the process() function.

What am I doing wrong? Is the queue correctly defined and shared between the two classes?

EDIT-----------------
I may have understood why I got an empty queue: I think it is because I was printing the queue I initialized before the Handler class had filled it. I modified my code and created two threads that share the same queue, but now the execution gets stuck on queue.put() and the ReadPcapFiles() thread stops running.

Here is the updated code:

import time
import pyshark
import concurrent.futures
import threading
import logging
from queue import Queue
from multiprocessing import Process
from watchdog.observers import Observer, api
from watchdog.events import PatternMatchingEventHandler

class Handler(PatternMatchingEventHandler):
    patterns = ["*.pcap", "*.pcapng"]

    def __init__(self, queue):
        PatternMatchingEventHandler.__init__(self)
        self.queue = queue

    def process(self, event):
        #print(f'event type: {event.event_type}  path : {event.src_path}')   
        self.queue.put(event.src_path)
        logging.info(f"Storing message: {self.queue.qsize()}")
        print("Producer queue: ", list(self.queue.queue))
        #self.queue.get()

    def on_created(self, event):
        self.process(event)          


def StartWatcher(watchdogq, event):
    path = 'C:\\...'
    handler = Handler(watchdogq)
    observer = Observer()
    while not event.is_set():
        observer.schedule(handler, path, recursive=False)
        print("About to start observer")
        observer.start()
        try:
            while True:
                time.sleep(1)
        except Exception as error:
            observer.stop()
            print("Error: " + str(error))
        observer.join()


def ReadPcapFiles(consumerq, event):
    while not event.is_set() or not consumerq.empty():
        print("Consumer queue: ", consumerq.get())
        #print("Consumer queue: ", list(consumerq.queue))

    # pcapfile = pyshark.FileCapture(self.queue.get())
    #     for packet in pcapfile:
    #         countPacket +=1 

if __name__ == '__main__':
    format = "%(asctime)s: %(message)s"
    logging.basicConfig(format=format, level=logging.INFO,datefmt="%H:%M:%S")
    logging.getLogger().setLevel(logging.DEBUG)

    queue = Queue()
    event = threading.Event()
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        executor.submit(StartWatcher,queue, event)
        executor.submit(ReadPcapFiles,queue, event)

        time.sleep(0.1)
        logging.info("Main: about to set event")
        event.set()

OLD CODE:

import time
from queue import Queue
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler

class Handler(PatternMatchingEventHandler):
    patterns = ["*.pcap", "*.pcapng"]

    def __init__(self, queue):
        PatternMatchingEventHandler.__init__(self)
        self.queue = queue

    def process(self, event):
        print(f'event type: {event.event_type}  path : {event.src_path}')   
        self.queue.put(event.src_path)

    def on_created(self, event):
        self.process(event)

class Watcher():
    def __init__(self, path):
        self.queue = Queue()
        self.observer = Observer()
        self.handler = Handler(self.queue)
        self.path = path

    def start(self): 
        self.observer.schedule(self.handler, self.path, recursive=True)
        self.observer.start()
        try:
            while True:
                time.sleep(1)
                self.queue.get()
                print(list(self.queue.queue))
        except Exception as error:
            self.observer.stop()
            print("Error: " + str(error))
        self.observer.join()  

if __name__ == '__main__':
    watcher = Watcher('C:\\...')
    watcher.start()

Answer 1:

This is working for me (I got the main idea from this answer, thanks!), but note that I consider it a workaround, so if someone has a better solution or can better explain the reason for this behavior in Python, please do not hesitate to answer!

My guess is that I had two main problems (a distilled sketch of the working pattern follows this list):
- I was starting the Watchdog observer inside another thread, which was somehow blocking my queue-consuming thread.
- Python threads do not truly run in parallel (the GIL serializes them), so starting an independent process for the heavy work was necessary.
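
Distilled, the working pattern is: drain the queue from a daemon thread while the watchdog Observer runs in the main thread, and hand the heavy pyshark work to worker processes. Below is a minimal sketch of just the thread/queue part; the dummy consume() body stands in for dispatching to a process pool:

import threading
import time
from queue import Queue

def consume(q):
    while True:
        item = q.get()             # blocks until the producer puts something
        print("consuming", item)   # real code would hand this off to a process pool

q = Queue()
threading.Thread(target=consume, args=(q,), daemon=True).start()

# The main thread stays free, like the watchdog Observer loop in the full code below.
for i in range(3):
    q.put(f"file_{i}.pcap")        # stand-in for Handler.process()
time.sleep(1)                      # give the daemon consumer time to drain the queue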

Here is my code:

import time
import pyshark
import threading
import logging
import os
from queue import Queue
from multiprocessing import Pool
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler

class Handler(PatternMatchingEventHandler):
    patterns = ["*.pcap", "*.pcapng"]

    def __init__(self, queue):
        PatternMatchingEventHandler.__init__(self)
        self.queue = queue

    def process(self, event):  
        self.queue.put(event.src_path)
        logging.info(f"Storing message: {self.queue.qsize()}")
        print("Producer queue: ", list(self.queue.queue))


    def on_created(self, event):
        # wait until the file transfer has finished before processing it
        file_size = -1
        while file_size != os.path.getsize(event.src_path):
            file_size = os.path.getsize(event.src_path)
            time.sleep(1)

        self.process(event)         

def ConsumeQueue(consumerq):
    # Create the worker pool once; spawning a new Pool for every
    # queue item would leak worker processes.
    pool = Pool()
    while True:
        if not consumerq.empty():
            pool.apply_async(ReadPcapFiles, (consumerq.get(), ))
        else:
            time.sleep(1)

def ReadPcapFiles(get_event):
    createdFile = get_event
    print(f"This is my event in ReadPcapFiles {createdFile}")

    countPacket = 0
    bandwidth = 0
    pcapfile = pyshark.FileCapture(createdFile)
    for packet in pcapfile:
        countPacket += 1
        bandwidth = bandwidth + int(packet.length)
    print(f"Packet nr {countPacket}")
    print(f"Total bytes {bandwidth}")  # total bytes captured, not per second


if __name__ == '__main__':

    format = "%(asctime)s: %(message)s"
    logging.basicConfig(format=format, level=logging.INFO,datefmt="%H:%M:%S")
    logging.getLogger().setLevel(logging.DEBUG)

    queue = Queue()
    path = 'C:\\...'

    worker = threading.Thread(target=ConsumeQueue, args=(queue, ), daemon=True)
    print("About to start worker")
    worker.start()

    event_handler = Handler(queue)
    observer = Observer()
    observer.schedule(event_handler, path, recursive=False)
    print("About to start observer")
    observer.start()

    try:
        while True:
            time.sleep(1)
    except Exception as error:
        observer.stop()
        print("Error: " + str(error))
    observer.join()


Answer 2:

There is an excellent library that provides concurrent access to the items in a queue. The queue is also persistent (file-based as well as database-based), so if the program crashes you can resume consuming events from the point where it crashed.

persist-queue
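
For illustration, here is a minimal sketch of persist-queue's file-based Queue (pip install persist-queue); the directory name 'pcap-queue' and the item string are placeholders:

import persistqueue

# File-based FIFO queue: items are written to disk, so they survive a crash.
q = persistqueue.Queue('pcap-queue')

q.put('capture-001.pcap')   # producer side, e.g. the watchdog handler
item = q.get()              # consumer side; blocks until an item is available
print(item)
q.task_done()               # persist the consumed state so the item is not redelivered

On a restart after a crash, constructing persistqueue.Queue('pcap-queue') again picks up any items that were put but never marked done.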