For Watson's Speech-To-Text Unity SDK, how can you specify keywords?

2019-05-11 08:59发布

I am trying to specify keywords in Watson's Speech-To-Text Unity SDK, but I'm unsure how to do this.

The details page doesn't show an example (see here: https://www.ibm.com/watson/developercloud/doc/speech-to-text/output.shtml),

and other forum posts are written for Java applications (see here: How to specify phonetic keywords for IBM Watson speech2text service?).

I've tried hard-coding these values in the RecognizeRequest class created in the "Recognize" function like so, but without success:

**EDIT - this function never gets called**

/// <summary>
/// Converts an AudioClip to WAV data and sends it to the "/v1/recognize" endpoint.
/// </summary>
/// <param name="clip">The audio to recognize; must not be null.</param>
/// <param name="callback">Stored on the request as its Callback; must not be null.</param>
/// <returns>
/// False when no connector is available or the WAV data exceeds MAX_RECOGNIZE_CLIP_SIZE;
/// otherwise whatever connector.Send returns.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when clip or callback is null.</exception>
public bool Recognize(AudioClip clip, OnRecognize callback)
    {
        if (clip == null)
            throw new ArgumentNullException("clip");
        if (callback == null)
            throw new ArgumentNullException("callback");

        RESTConnector connector = RESTConnector.GetConnector(SERVICE_ID, "/v1/recognize");
        if (connector == null)
            return false;

        RecognizeRequest req = new RecognizeRequest();
        req.Clip = clip;
        req.Callback = callback;

        req.Headers["Content-Type"] = "audio/wav";
        req.Send = WaveFile.CreateWAV(clip);
        // Reject oversized payloads before anything is sent.
        if (req.Send.Length > MAX_RECOGNIZE_CLIP_SIZE)
        {
            Log.Error("SpeechToText", "AudioClip is too large for Recognize().");
            return false;
        }
        req.Parameters["model"] = m_RecognizeModel;
        req.Parameters["continuous"] = "false";
        req.Parameters["max_alternatives"] = m_MaxAlternatives.ToString();
        req.Parameters["timestamps"] = m_Timestamps ? "true" : "false";
        // Fixed: the "false" literal was missing its opening quote, which did not compile.
        req.Parameters["word_confidence"] = m_WordConfidence ? "true" : "false";

        //these "keywords" and "keywords_threshold" and "keywordsThreshold" parameters
        //are just my guess for how to set these values
        // NOTE(review): every other parameter here is passed as a string; an array and a
        // double may not be serialized as the service expects -- confirm against the API docs.
        req.Parameters["keywords"] = new string[] {"fun", "match", "test" };
        req.Parameters["keywordsThreshold"] = .2;
        req.Parameters["keywords_threshold"] = .2;
        //end my test insertions

        req.OnResponse = OnRecognizeResponse;

        return connector.Send(req);
    }

but the returned SpeechRecognitionEvent result value does not contain any keywords_result. This is my aim. I'm trying to view the confidence for each keyword in the keywords_result object like so, but the keywords_result object comes back as null.

/// <summary>
/// Forwards a recognition event to the result output and logs each keyword hit
/// together with its confidence.
/// </summary>
/// <param name="result">The recognition event; may be null.</param>
private void OnRecognize(SpeechRecognitionEvent result) {
    Debug.Log("Recognizing!");
    m_ResultOutput.SendData(new SpeechToTextData(result));

    if (result != null && result.results.Length > 0) {
        if (m_Transcript != null)
            m_Transcript.text = "";

        foreach (var res in result.results) {
            // keywords_result (or its keyword array) can come back as null; skip such
            // results instead of throwing a NullReferenceException.
            if (res.keywords_result == null || res.keywords_result.keyword == null)
                continue;

            foreach (var keyword in res.keywords_result.keyword) {
                string text = keyword.normalized_text;
                // 'var' avoids a narrowing assignment whatever numeric type confidence has.
                var confidence = keyword.confidence;
                Debug.Log(text + ": " + confidence);
            }
        }
    }
}

Has anyone successfully implemented Keyword Confidence Evaluation with Watson's Speech-To-Text SDK in Unity or C#? All ideas and suggestions are welcome.

PS This is my first post :)

1条回答
我想做一个坏孩纸
2楼-- · 2019-05-11 09:35

Turns out I needed to specify the keywords in the "SendStart" function like so:

/// <summary>
/// Serializes the "start" action message, including the keyword settings, sends it
/// over the listen socket and records when it was sent.
/// </summary>
/// <exception cref="WatsonException">Thrown when the listen socket is null.</exception>
private void SendStart() {
        if (m_ListenSocket == null)
            throw new WatsonException("SendStart() called with null connector.");

        string audioFormat = "audio/l16;rate=" + m_RecordingHZ.ToString() + ";channels=1;";

        Dictionary<string, object> startMessage = new Dictionary<string, object>
        {
            { "action", "start" },
            { "content-type", audioFormat },
            { "continuous", EnableContinousRecognition },
            { "max_alternatives", m_MaxAlternatives },
            { "interim_results", EnableInterimResults },
            { "word_confidence", m_WordConfidence },
            { "timestamps", m_Timestamps },

            //specify keywords here
            { "keywords", keywordsToCheck.ToArray() },
            { "keywords_threshold", 0.05 }
            //end additions here
        };

        string payload = Json.Serialize(startMessage);
        m_ListenSocket.Send(new WSConnector.TextMessage(payload));
        m_LastStartSent = DateTime.Now;
    }

and write some code to parse the keywords_result properly in the "ParseRecognizeResponse" function:

/// <summary>
/// Converts a deserialized recognize response into a SpeechRecognitionEvent,
/// including keyword hits, alternatives, timestamps and word confidences.
/// </summary>
/// <param name="resp">The deserialized response dictionary; may be null.</param>
/// <returns>
/// Null when resp is null or has no "results" list; otherwise an event holding one
/// SpeechRecognitionResult per entry that has an "alternatives" list.
/// </returns>
private SpeechRecognitionEvent ParseRecognizeResponse(IDictionary resp){

        if (resp == null)
            return null;


        List<SpeechRecognitionResult> results = new List<SpeechRecognitionResult>();
        IList iresults = resp["results"] as IList;
        if (iresults == null)
            return null;

        foreach (var r in iresults)
        {
            IDictionary iresult = r as IDictionary;
            // Fixed: the original re-checked the list (iresults) here, so an entry that was
            // not a dictionary fell through and was dereferenced as null.
            if (iresult == null)
                continue;

            SpeechRecognitionResult result = new SpeechRecognitionResult();

            //added this section, starting here
            IDictionary iKeywords_result = iresult["keywords_result"] as IDictionary;
            result.keywords_result = new KeywordResults();
            List<KeywordResult> keywordResults = new List<KeywordResult>();
            // A result without a "keywords_result" dictionary yields an empty keyword
            // array instead of a NullReferenceException.
            if (iKeywords_result != null) {
                foreach (string key in keywordsToCheck) {
                    IList keyword_Results = iKeywords_result[key] as IList;
                    if (keyword_Results == null) {
                        continue;
                    }
                    foreach (var res in keyword_Results) {
                        IDictionary kw_resultDic = res as IDictionary;
                        if (kw_resultDic == null) {
                            continue;
                        }
                        KeywordResult keyword_Result = new KeywordResult();
                        keyword_Result.confidence = (double)kw_resultDic["confidence"];
                        keyword_Result.end_time = (double)kw_resultDic["end_time"];
                        keyword_Result.start_time = (double)kw_resultDic["start_time"];
                        keyword_Result.normalized_text = (string)kw_resultDic["normalized_text"];
                        keywordResults.Add(keyword_Result);
                    }
                }
            }
            result.keywords_result.keyword = keywordResults.ToArray();
            //ends here

            result.final = (bool)iresult["final"];

            IList ialternatives = iresult["alternatives"] as IList;
            if (ialternatives == null)
                continue;

            List<SpeechRecognitionAlternative> alternatives = new List<SpeechRecognitionAlternative>();
            foreach (var a in ialternatives)
            {
                IDictionary ialternative = a as IDictionary;
                if (ialternative == null)
                    continue;

                SpeechRecognitionAlternative alternative = new SpeechRecognitionAlternative();
                alternative.transcript = (string)ialternative["transcript"];
                if (ialternative.Contains("confidence"))
                    alternative.confidence = (double)ialternative["confidence"];

                // The 'as IList' casts below can yield null; such fields are left unset.
                IList itimestamps = ialternative.Contains("timestamps")
                    ? ialternative["timestamps"] as IList
                    : null;
                if (itimestamps != null)
                {
                    TimeStamp[] timestamps = new TimeStamp[itimestamps.Count];
                    for (int i = 0; i < itimestamps.Count; ++i)
                    {
                        IList itimestamp = itimestamps[i] as IList;
                        if (itimestamp == null)
                            continue;

                        // Each entry is read as [word, start, end].
                        TimeStamp ts = new TimeStamp();
                        ts.Word = (string)itimestamp[0];
                        ts.Start = (double)itimestamp[1];
                        ts.End = (double)itimestamp[2];
                        timestamps[i] = ts;
                    }

                    alternative.Timestamps = timestamps;
                }

                IList iconfidence = ialternative.Contains("word_confidence")
                    ? ialternative["word_confidence"] as IList
                    : null;
                if (iconfidence != null)
                {
                    WordConfidence[] confidence = new WordConfidence[iconfidence.Count];
                    for (int i = 0; i < iconfidence.Count; ++i)
                    {
                        IList iwordconf = iconfidence[i] as IList;
                        if (iwordconf == null)
                            continue;

                        // Each entry is read as [word, confidence].
                        WordConfidence wc = new WordConfidence();
                        wc.Word = (string)iwordconf[0];
                        wc.Confidence = (double)iwordconf[1];
                        confidence[i] = wc;
                    }

                    alternative.WordConfidence = confidence;
                }

                alternatives.Add(alternative);
            }
            result.alternatives = alternatives.ToArray();
            results.Add(result);
        }

        return new SpeechRecognitionEvent(results.ToArray());
    }

So that now, when OnRecognize gets passed this SpeechRecognitionEvent, I've changed the code for displaying word alternatives and their confidence score, to displaying keyword results and their confidence score, like so:

/// <summary>
/// Forwards a recognition event to the result output and writes every keyword hit,
/// with its final/interim state and confidence, into the transcript text.
/// </summary>
/// <param name="result">The recognition event; may be null.</param>
private void OnRecognize(SpeechRecognitionEvent result) {
    //Debug.Log("Recognizing!");
    m_ResultOutput.SendData(new SpeechToTextData(result));

    if (result == null || result.results.Length == 0)
        return;

    // Fixed: the original only null-checked m_Transcript before clearing it and then
    // appended to it unguarded. Without a transcript there is nothing to display.
    if (m_Transcript == null)
        return;

    m_Transcript.text = "";

    foreach (var res in result.results) {
        //start keyword recognition changes here
        if (res.keywords_result == null || res.keywords_result.keyword == null)
            continue;

        foreach (var keyword in res.keywords_result.keyword) {
            m_Transcript.text += string.Format("{0} ({1}, {2:0.00})\n",
                keyword.normalized_text, res.final ? "Final" : "Interim", keyword.confidence);
        }
        //end here
    }
}

Note that using the keyword results' confidence values is much more valuable than doing some hardcoded check to see whether the word alternatives Watson returns match your keywords, and then using the confidence value there. The confidence values come back much higher when checking the keywords_result.keyword[].confidence values, because Watson is already checking against those words. That was the impetus for going through with this process and parsing the SpeechRecognitionEvent result value to properly include the keywords_result values.

For some background, I'm creating a rhythm game for children with dyslexia to learn word formation, so think Guitar Hero meets Sesame Street.

查看更多
登录 后发表回答