{"created":"2023-09-28T03:16:00.083877+00:00","id":2000052,"links":{},"metadata":{"_buckets":{"deposit":"3b062eb5-d5c2-42c4-b7b0-2227fc188e76"},"_deposit":{"created_by":16,"id":"2000052","owner":"16","owners":[16],"pid":{"revision_id":0,"type":"depid","value":"2000052"},"status":"published"},"_oai":{"id":"oai:muroran-it.repo.nii.ac.jp:02000052","sets":["216:241","46"]},"author_link":[],"item_79_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicIssueDates":{"bibliographicIssueDate":"2023-02-03","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"3","bibliographicNumberOfPages":"11","bibliographicPageStart":"1743","bibliographicVolumeNumber":"23","bibliographic_titles":[{"bibliographic_title":"Sensors","bibliographic_titleLang":"en"}]}]},"item_79_description_7":{"attribute_name":"抄録","attribute_value_mlt":[{"subitem_description":"The existing research on emotion recognition commonly uses mel spectrogram (MelSpec) and Geneva minimalistic acoustic parameter set (GeMAPS) as acoustic parameters to learn the audio features. MelSpec can represent the time-series variations of each frequency but cannot manage multiple types of audio features. On the other hand, GeMAPS can handle multiple audio features but fails to provide information on their time-series variations. Thus, this study proposes a speech emotion recognition model based on a multi-input deep neural network that simultaneously learns these two audio features. The proposed model comprises three parts, specifically, for learning MelSpec in image format, learning GeMAPS in vector format, and integrating them to predict the emotion. Additionally, a focal loss function is introduced to address the imbalanced data problem among the emotion classes. The results of the recognition experiments demonstrate weighted and unweighted accuracies of 0.6657 and 0.6149, respectively, which are higher than or comparable to those of the existing state-of-the-art methods. Overall, the proposed model significantly improves the recognition accuracy of the emotion “happiness”, which has been difficult to identify in previous studies owing to limited data. Therefore, the proposed model can effectively recognize emotions from speech and can be applied for practical purposes with future development.","subitem_description_language":"en","subitem_description_type":"Abstract"}]},"item_79_link_17":{"attribute_name":"出版者版へのリンク","attribute_value_mlt":[{"subitem_link_language":"ja","subitem_link_text":"10.3390/s23031743","subitem_link_url":"https://doi.org/10.3390/s23031743"}]},"item_79_publisher_11":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"MDPI","subitem_publisher_language":"en"}]},"item_79_relation_16":{"attribute_name":"PMID","attribute_value_mlt":[{"subitem_relation_type":"isIdenticalTo","subitem_relation_type_id":{"subitem_relation_type_id_text":"36772782","subitem_relation_type_select":"PMID"}}]},"item_79_relation_18":{"attribute_name":"DOI","attribute_value_mlt":[{"subitem_relation_type":"isIdenticalTo","subitem_relation_type_id":{"subitem_relation_type_id_text":"10.3390/s23031743","subitem_relation_type_select":"DOI"}}]},"item_79_rights_19":{"attribute_name":"権利","attribute_value_mlt":[{"subitem_rights":"© 2023 by the authors. Licensee MDPI","subitem_rights_language":"en"}]},"item_79_source_id_12":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"1424-8220","subitem_source_identifier_type":"EISSN"}]},"item_79_version_type_21":{"attribute_name":"著者版フラグ","attribute_value_mlt":[{"subitem_version_resource":"http://purl.org/coar/version/c_970fb48d4fbd8a85","subitem_version_type":"VoR"}]},"item_access_right":{"attribute_name":"アクセス権","attribute_value_mlt":[{"subitem_access_right":"open access","subitem_access_right_uri":"http://purl.org/coar/access_right/c_abf2"}]},"item_creator":{"attribute_name":"著者","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Toyoshima, Itsuki","creatorNameLang":"en"},{"creatorName":"豊島, 依槻","creatorNameLang":"ja"}]},{"creatorAffiliations":[{"affiliationNames":[{},{}]}],"creatorNames":[{"creatorName":"Okada, Yoshifumi","creatorNameLang":"en"},{"creatorName":"岡田, 吉史","creatorNameLang":"ja"}]},{"creatorNames":[{"creatorName":"Ishimaru, Momoko","creatorNameLang":"en"},{"creatorName":"石丸, 桃子","creatorNameLang":"ja"}]},{"creatorNames":[{"creatorName":"Uchiyama, Ryunosuke","creatorNameLang":"en"},{"creatorName":"内山, 竜之介","creatorNameLang":"ja"}]},{"creatorNames":[{"creatorName":"Tada, Mayu","creatorNameLang":"en"},{"creatorName":"多田, 真悠","creatorNameLang":"ja"}]}]},"item_files":{"attribute_name":"ファイル情報","attribute_type":"file","attribute_value_mlt":[{"accessrole":"open_access","date":[{"dateType":"Available","dateValue":"2023-09-28"}],"filename":"sensors-23-01743-v2.pdf","filesize":[{"value":"1 MB"}],"format":"application/pdf","licensetype":"license_0","url":{"url":"https://muroran-it.repo.nii.ac.jp/record/2000052/files/sensors-23-01743-v2.pdf"},"version_id":"1363af3b-1914-4ac8-a0bd-8a729c5c8135"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"multi-input deep neural network","subitem_subject_language":"en","subitem_subject_scheme":"Other"},{"subitem_subject":"speech emotion recognition","subitem_subject_language":"en","subitem_subject_scheme":"Other"},{"subitem_subject":"mel spectrogram","subitem_subject_language":"en","subitem_subject_scheme":"Other"},{"subitem_subject":"GeMAPS","subitem_subject_language":"en","subitem_subject_scheme":"Other"},{"subitem_subject":"focal loss function","subitem_subject_language":"en","subitem_subject_scheme":"Other"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"eng"}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourcetype":"journal article","resourceuri":"http://purl.org/coar/resource_type/c_6501"}]},"item_title":"Multi-Input Speech Emotion Recognition Model Using Mel Spectrogram and GeMAPS","item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"Multi-Input Speech Emotion Recognition Model Using Mel Spectrogram and GeMAPS","subitem_title_language":"en"}]},"item_type_id":"79","owner":"16","path":["46","241"],"pubdate":{"attribute_name":"PubDate","attribute_value":"2023-09-28"},"publish_date":"2023-09-28","publish_status":"0","recid":"2000052","relation_version_is_last":true,"title":["Multi-Input Speech Emotion Recognition Model Using Mel Spectrogram and GeMAPS"],"weko_creator_id":"16","weko_shared_id":-1},"updated":"2023-09-28T03:34:53.788767+00:00"}