Searching in a .txt file and comparing the two values

2019-06-08 17:24发布

问题:

"cadence_regulatable_result": "completeRecognition",
    "appserver_results": {
        "status": "success",
        "final_response": 0,
        "payload": {
            "actions": [{
                "speaker": "user",
                "type": "conversation",
                "nbest_text": {
                    "confidences": [478,
                    0,
                    0],
                    "words": [[{
                        "stime": 0,
                        "etime": 1710,
                        "word": "ConnectedDrive\\*no-space-before",
                        "confidence": "0.241"
                    }],
                    [{
                        "stime": 0,
                        "etime": 1020,
                        "word": "Connected\\*no-space-before",
                        "confidence": "0.0"
                    },
                    {
                        "stime": 1020,
                        "etime": 1710,
                        "word": "drive",
                        "confidence": "0.0"
                    }],
                    [{
                        "stime": 0,
                        "etime": 900,
                        "word": "Connect\\*no-space-before",
                        "confidence": "0.0"
                    },
                    {
                        "stime": 900,
                        "etime": 980,
                        "word": "to",
                        "confidence": "0.0"
                    },
                    {
                        "stime": 980,
                        "etime": 1710,
                        "word": "drive",
                        "confidence": "0.0"
                    }]],
                    "transcriptions"= ["ConnectedDrive",
                    "Connected drive",
                    "Connect to drive"]
                }
            }]
        }
    },
    "final_response": 0,
    "prompt": "",
    "result_format": "appserver_post_results"
}: form-data;name="QueryResult"Content-Type: application/JSON;charset=utf-8Nuance-Context: efb3d3ce-ef50-4e83-8c31-063c3f5208aa{
    "status_code": 0,
    "result_type": "DRAGON_NLU_ASR_CMD",
    "NMAS_PRFX_SESSION_ID": "f786f0be-d547-4fca-8d72-96429a30c9db",
    "NMAS_PRFX_TRANSACTION_ID": "1",
    "audio_transfer_info": {
        "packages": [{
            "time": "20151221085512579",
            "bytes": 1633
        },
        {
            "time": "20151221085512598",
            "bytes": 3969
        }],
        "nss_server": "10.56.11.186:4503",
        "end_time": "20151221085512596",
        "audio_id": 1,
        "start_time": "20151221085512303"
    },
    "cadence_regulatable_result": "completeRecognition",
    "appserver_results": {
        "status": "success",
        "final_response": 1,
        "payload": {
            "diagnostic_info": {
                "adk_dialog_manager_status": "undefined",
                "nlu_version": "[NLU_PROJECT:NVCCP-eng-USA];[D0160932];[VL-Models:Version: vl.1.100.12-2-GMT20151130160335]",
                "nlps_host": "mt-dmz-nlps002.nuance.com:8636",
                "nlps_ip": "10.56.10.51",
                "application": "AUDI_2017",
                "nlu_component_flow": "[Input:VoiceJSON] [FieldID|auto_main] [NLUlib|C-eckart-r$Rev$.f20151118.1250] [build|G-r72490M.f20151130.1055] [vlmodel|Version: 2-GMT20151130160335] [Flow|+VlingoTokenized]",
                "third_party_delay": "0",
                "nmaid": "AUDI_SDS_2017_EXT_20151203",
                "nlps_profile": "AUDI_2017",
                "fieldId": "auto_main",
                "nlps_profile_package_version": "r159218",
                "nlu_annotator": "com-GBR.ncs51.VlingoNLU-client-qNVCCP_NCS51",
                "ext_map_time": "2",
                "nlu_use_literal_annotator": "0",
                "int_map_time": "2",
                "nlps_nlu_type": "nlu_project",
                "nlu_language": "eng-GBR",
                "timing": {
                    "finalRespSentDelay": "188",
                    "intermediateRespSentDelay": "648"
                },
                "nlps_profile_package": "AUDI_2017"
            },
            "actions": [{
                "Input": {
                    "Interpretations": ["ConnectedDrive"],
                    "Type": "asr"
                },
                "Instances": [{
                    "nlu_classification": {
                        "Domain": "UDE",
                        "Intention": "Unspecified"
                    },
                    "nlu_interpretation_index": 1,
                    "nlu_slot_details": {
                        "Name": {
                            "literal": "ConnectedDrive"
                        },
                        "Search-phrase": {
                            "literal": "connecteddrive"
                        }
                    },
                    "interpretation_confidence": 4549
                }],
                "type": "nlu_results",
                "api_version": "1.0"
            }],
            "nlps_version": "nlps(z):6.1.100.12.2-B359;Version: nlps-base-GMT20151130193521;"
        }
    },

First, I search the .txt file for the words "transcriptions" and "Interpretations" (which is why I am using regex). Then I want to compare the FIRST value of transcriptions ("Drive me to a charging station") with the Interpretations value ("Drive me to a charging station"). With the program below, it just prints "Recognition is INVALID".

# Raw string literal: in a normal literal the "\U" in "C:\Users" is parsed as
# a Unicode escape and the script fails to compile on Python 3.
directory = os.path.join(r"C:\Users\hemanth_venkatappa\Desktop\Working\pcm-audio\English")
for subdir, dirs, files in os.walk(directory):
    for file in files:
        if file.endswith(".txt"):
            # os.walk yields bare file names, so join with subdir to get a
            # usable path, and open it: json.load expects a file object,
            # not a file name.
            with open(os.path.join(subdir, file)) as fobj:
                content = json.load(fobj)
            if "status_code" in content:
                if content["status_code"] == 0:
                    print("valid")

回答1:

You can take a look at difflib for text comparison using Python.

The difflib module contains tools for computing and working with differences between sequences. It is especially useful for comparing text, and includes functions that produce reports using several common difference formats.

difflib tutorial

Using this module you can evaluate differences between two strings or .txt files this way:

import difflib

# Candidate strings to score against the expected text.
a = ["Drive me to a charging station", "Drive me to charging station", "Drive me to a charging Station"]
correct = ["Drive me to a charging station"]

# print(...) with parentheses runs on both Python 2 and Python 3.
print(difflib.SequenceMatcher(None, a[0], correct[0]).ratio())
>> 1.0

print(difflib.SequenceMatcher(None, a[1], correct[0]).ratio())
>> 0.965517241379

print(difflib.SequenceMatcher(None, a[2], correct[0]).ratio())
>> 0.966666666667

As you can see, the .ratio() between a[0] and correct is 1.0 or 100%. This means they're the same string.

You can use a loop to evaluate the ratios and if ratio == 1.0 then print "Recognition is VALID "

Also, if you don't want to use the .ratio() between the strings, you can check the differences using:

# Compare the two lists item by item; each yielded line carries a
# two-character prefix describing where the item was found.
d = difflib.Differ()
diff = d.compare(a, correct)
# print(...) with parentheses runs on both Python 2 and Python 3.
print('\n'.join(diff))

And this block of code gives me:

  Drive me to a charging station  # no marker at the start: the line is present in both sequences
- Drive me to charging station    # "-" marker: the line appears only in the first sequence (a)
- Drive me to a charging Station  # same here

Then you'll have to figure out a way to print Recognition is VALID or INVALID according to your expectations.



回答2:

This seems to be JSON. You should be able to load the whole file into a dictionary with:

import json
data = json.load(f)  # f must be an already-open file object, not a file name

Now data contains a dictionary of other dictionaries and lists. You need to find your way through by exploring the dictionary.

Similar to this:

# Walk the nested dicts and lists one key/index at a time. These key paths
# are illustrative and have to be adjusted to the layout of the real data.
interpretations = data["appserver_results"]["actions"][0]["Input"]["Interpretations"] 
transcriptions = (data["cadence_regulatable_result"]["completeRecognition"]["appserver_results"]
                  ["payload"]["actions"][0]["nbest_text"]["transcriptions"])

You need to adjust this to your real data. Play around at the interactive prompt to find out what keys and indices you need to use.

Now you check if it is contained:

# Membership test against the transcriptions; assumes transcriptions is a
# list of strings as in the sample data, so this is an exact-equality match.
if interpretations[0] in transcriptions:
    print('found', interpretations[0] )

Your final program can look similar to this:

def find_interpretations(fobj):
    """Return the first interpretation if it is also a transcription.

    Parses JSON from the open file object ``fobj``, looks up the
    "Interpretations" and "transcriptions" entries through fixed key paths,
    and returns the first interpretation when it is contained in the
    transcriptions. Returns None otherwise. Lookup errors are not caught
    here and propagate to the caller.
    """
    payload = json.load(fobj)

    asr_input = payload["appserver_results"]["actions"][0]["Input"]
    interpretations = asr_input["Interpretations"]

    recognition = payload["cadence_regulatable_result"]["completeRecognition"]
    nbest = recognition["appserver_results"]["payload"]["actions"][0]["nbest_text"]
    transcriptions = nbest["transcriptions"]

    top = interpretations[0]
    return top if top in transcriptions else None

# Visit every .txt file below `directory` and report the files for which
# find_interpretations returns a match.
for subdir, dirs, files in os.walk(directory):
    for file in files:
        if file.endswith(".txt"):
            # os.walk yields bare names; join with subdir for a usable path.
            file_name = os.path.join(subdir, file)
            with open(file_name) as fobj:
                found = find_interpretations(fobj)
                if found:
                    print('found: {} in file: {}'.format(found, file_name))


回答3:

Since the tries with difflib and json led you nowhere, this is based on your original approach from revision 2 of your question; it basically just uses re.search instead of re.findall to check whether the first transcription equals the interpretation:

#!/usr/bin/env python3
import os
import re
# Walk the data directory and, for every .txt file, report whether the status
# code is 0 and whether the first transcription equals the interpretation.
directory = os.path.join("../data/English")
for subdir, dirs, files in os.walk(directory):
    for file in files:
        if file.endswith(".txt"):
            # Context manager so each file is closed before the next one is
            # opened, instead of leaking one handle per file.
            with open(os.path.join(subdir, file), 'r') as f:
                a = f.read()
            if re.search(r'"status_code": 0', a):
                print('Status is Valid')
            else:
                print('Status is Invalid')
            # Capture only the first quoted string after the key; [^"]* stops
            # at its closing quote even if more strings share the line.
            m = re.search(r'"transcriptions"= ."([^"]*)"', a)
            # Escape the captured text so regex metacharacters in it are
            # matched literally, and require the closing quote so that the
            # interpretation must equal the transcription, not just start
            # with it.
            if m and re.search(r'"Interpretations": ."' + re.escape(m.group(1)) + '"', a):
                print('Recognition is VALID')
            else:
                print('Recognition is INVALID')