% Conference paper record; the repository copy is at the handle URL in `url`.
% `funding` is a non-standard field: bibliography styles ignore unknown
% fields, so the acknowledgement is kept as metadata without being printed.
@inproceedings{10902/36330,
  author    = {Suárez Plata, Daniel Nicolás and Fernández Solórzano, Víctor Manuel and Posadas Cobo, Héctor},
  title     = {{CNN-LSTM} implementation methodology on {SoC} {FPGA} for human action recognition based on video},
  booktitle = {27th {Euromicro} Conference on Digital System Design ({DSD}), París},
  pages     = {202--209},
  year      = {2024},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  url       = {https://hdl.handle.net/10902/36330},
  abstract  = {The growing use of AI-driven video applications like surveillance or healthcare monitoring underscores the need for embedded solutions capable of accurately categorizing human actions in real-time videos. A methodology is proposed for implementing a customized CNN-LSTM architecture on AMD-Xilinx SoC FPGA devices for human action categorization from video data. In this approach, CNN operations are accelerated by the Vitis-AI DPU within the FPGA, offering flexibility to support a range of CNN architectures without requiring individual hardware description language development. This adaptability is crucial given the varying performance of CNN models across datasets. LSTM operations are executed on the SoC processors, overcoming limitations in the support provided by DPU IP cores for such networks, while maintaining flexibility to assess different configurations. Additionally, a pipeline strategy is proposed to enable parallel execution of both CNN and LSTM components, optimizing resource utilization and minimizing idle times. To demonstrate the validity of the proposed implementation methodology, experiments were conducted on the ZCU102 development board, equipped with a Zynq UltraScale+ MPSoC, and involved the use of the VGG16 CNN model along with the exploration of different LSTM configurations. The results demonstrate remarkable computational performance, achieving frame rates of up to 44.34 FPS for videos recorded at a resolution of 320×240 pixels, surpassing real-time requirements. Additionally, the proposed implementation maintains high accuracy levels, exemplified by the single bidirectional LSTM layer achieving a competitive accuracy of 73.33\% based on the UCF101 dataset.},
  funding   = {This work has been supported by Project PID2020-116417RB-C43, funded by Spanish MCIN/AEI/10.13039/501100011033 and by Project No 101007273 ECSEL DAIS, funded by EU H2020 and by Spanish pci2021-121988.},
}