Parsing a (modified) RIS file with Python

2019-08-16 01:08发布

I have a bunch of (modified) RIS files. The toy example looks like the following:

Record #1 of 2
ID: CN-01160769
AU: Uedo N
AU: Kasiser R
TI: Development of an E-learning system
SO: United European Gastroenterology Journal
YR: 2015


Record #2 of 2
ID: CN-01070265
AU: Krogh LQ
TI: E-learning in pediatric basic life support
SO: Resuscitation
YR: 2015

In brief, each record starts with a "Record #" line and ends with two blank lines. The task is to parse the file and extract the tags and their fields.

Pasted below is my current code (adapted from here):

import re

class RIS:
    """ Holds the records read from a tagged, RIS-like text file """

    def __init__(self, in_file=None):
        """ Start with no records; parse ``in_file`` straight away if it is truthy """
        self.records = []
        if not in_file:
            return
        self.parse(in_file)

    def parse(self, in_file):
        """ Hand every non-blank ``TAG: value`` line of ``in_file`` to process_field.

        All lines are read and stripped before any of them is processed.
        A non-blank line that is not of the ``TAG: value`` form raises
        ValueError carrying that line.
        """
        self.current_tag = None
        self.current_record = None
        tagged_line = re.compile("^([A-Z][A-Z0-9]): (.*)")
        # Strip surrounding whitespace and keep only the non-empty lines
        content = [text for text in (raw.strip() for raw in in_file) if text]
        for text in content:
            found = tagged_line.match(text)
            if found is None:
                raise ValueError(text)
            tag, field = found.groups()
            self.process_field(tag, field)

    def process_field(self, tag, field):
        """ Apply one tag/value pair to the record currently being built.

        ``ID`` opens a new record, ``YR`` appends the open record to
        ``self.records`` (the year itself is not stored), ``AU``/``AD``
        values are gathered in lists, and any other tag may occur only
        once per record (ValueError otherwise).
        """
        if tag == "ID":
            self.current_record = {tag: field}
            return
        if tag == "YR":
            self.records.append(self.current_record)
            self.current_record = None
            return
        seen_before = tag in self.current_record
        if tag in ("AU", "AD"):
            # Repeatable tags: collect every value in order of appearance
            if seen_before:
                self.current_record[tag].append(field)
            else:
                self.current_record[tag] = [field]
            return
        if seen_before:
            raise ValueError("Duplicate tag: %s" % tag)
        self.current_record[tag] = field

def main():
    """ Parse ``test.ris`` and pretty-print the records found in it """
    from pprint import PrettyPrinter
    with open("test.ris", "rt") as handle:
        parsed = RIS(handle)
        printer = PrettyPrinter()
        printer.pprint(parsed.records)

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

The current code doesn't work because it doesn't recognize the start of a record (e.g., "Record #1 of 2"), and in addition it doesn't know where a record stops. In the current version of the code I use ID as the start tag and YR as the stop tag. However, the code exits with the error:

ValueError: Record #1 of 2

Any suggestions on how to properly adapt the code are very welcome.

1条回答
smile是对你的礼貌
2楼-- · 2019-08-16 01:32

You just need to add a check that skips the "Record #x of 2" lines.

import re

class RIS:
    """ Parser for a modified RIS export made of ``TAG: value`` lines.

    Each record is introduced by a ``Record #N of M`` header, opened by its
    ``ID`` field and closed by its ``YR`` field, for example::

        Record #1 of 2
        ID: CN-01160769
        AU: Uedo N
        TI: Development of an E-learning system
        YR: 2015

    Parsed records are collected in ``self.records`` as dictionaries keyed
    by tag.  ``AU`` and ``AD`` may repeat and are stored as lists; every
    other tag maps to a single string.
    """

    def __init__(self, in_file=None):
        """ Create an empty parser and parse ``in_file`` if one is given.

        Args:
            in_file: optional iterable of text lines (an open file, a list
                of strings, ...).  A falsy value is not parsed.
        """
        self.records = []
        if in_file:
            self.parse(in_file)

    def parse(self, in_file):
        """ Parse every line of ``in_file`` and append the records found.

        Blank lines and ``Record #N of M`` header lines are ignored.  A
        record that is still open when a new ``ID`` arrives or when the
        input ends is kept rather than silently dropped.

        Args:
            in_file: iterable of text lines.

        Raises:
            ValueError: for a non-blank, non-header line that is not of the
                ``TAG: value`` form (the line is the exception argument),
                for a field that appears outside of a record, and for a
                repeated single-valued tag.
        """
        self.current_tag = None
        self.current_record = None
        field_re = re.compile(r"^([A-Z][A-Z0-9]): (.*)")
        # Only genuine record headers are skipped; testing for a bare "#"
        # would also discard real fields such as "TI: Issue #5".
        header_re = re.compile(r"^Record #\d+ of \d+$")
        for raw_line in in_file:
            line = raw_line.strip()
            if not line or header_re.match(line):
                continue
            match = field_re.match(line)
            if match is None:
                raise ValueError(line)
            tag, field = match.groups()
            self.process_field(tag, field)
        # Keep a trailing record that was never closed by a YR line.
        if self.current_record is not None:
            self.records.append(self.current_record)
            self.current_record = None

    def process_field(self, tag, field):
        """ Apply one tag/value pair to the record currently being built.

        Args:
            tag: two-character field tag such as ``"ID"`` or ``"AU"``.
            field: text that followed ``"TAG: "`` on the line.

        Raises:
            ValueError: if the field arrives while no record is open, or if
                a single-valued tag is repeated within one record.
        """
        if tag == "ID":
            # A new ID while a record is still open means the previous
            # record had no YR line; store it instead of overwriting it.
            if self.current_record is not None:
                self.records.append(self.current_record)
            self.current_record = {tag: field}
            return

        record = self.current_record
        if record is None:
            raise ValueError("Field outside of a record: %s: %s" % (tag, field))

        if tag == "YR":
            # NOTE: YR only marks the end of the record; its value is not stored.
            self.records.append(record)
            self.current_record = None
        elif tag in ("AU", "AD"):
            # Repeatable tags: collect every value in order of appearance.
            record.setdefault(tag, []).append(field)
        elif tag in record:
            raise ValueError("Duplicate tag: %s" % tag)
        else:
            record[tag] = field

def main():
    """ Parse ``test.ris`` and pretty-print the records found in it """
    from pprint import PrettyPrinter
    with open("test.ris", "rt") as handle:
        parsed = RIS(handle)
        printer = PrettyPrinter()
        printer.pprint(parsed.records)

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

The added code:

# Skip the "Record #N of M" header lines, which carry no "TAG: value" pair.
# Caveat: this test also skips every other line that contains a "#".
if "#" in line:
    continue

The output is:

[{'AU': ['Uedo N', 'Kasiser R'],
  'ID': 'CN-01160769',
  'SO': 'United European Gastroenterology Journal',
  'TI': 'Development of an E-learning system'},
 {'AU': ['Krogh LQ'],
  'ID': 'CN-01070265',
  'SO': 'Resuscitation',
  'TI': 'E-learning in pediatric basic life support'}]
查看更多
登录 后发表回答