From af96b559e637dd8f5eaa4ee702ea8d2aec63d371 Mon Sep 17 00:00:00 2001
From: Adrian Iain Lam
Date: Sun, 16 May 2021 02:59:36 +0100
Subject: [PATCH] Remove references to dlib and OpenCV. Add TODOs for use with
 OSF. Replace some dlib types with generic structs. Code compiles now but
 does not work because all coordinates are zero.

---
 CMakeLists.txt                     |   8 +-
 config.txt                         |  38 ++----
 include/facial_landmark_detector.h |  47 ++++---
 src/facial_landmark_detector.cpp   | 255 ++++++++++++-------------------------
 src/math_utils.h                   |  18 +--
 5 files changed, 124 insertions(+), 242 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index eb72aa8..949c6da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,13 +2,9 @@ cmake_minimum_required(VERSION 3.16)
 
 project(FacialLandmarksForCubism_project)
 
-add_subdirectory(lib/dlib/dlib dlib_build)
-find_package(OpenCV REQUIRED)
-include_directories(${OpenCV_INCLUDE_DIRS})
-
 add_library(FacialLandmarksForCubism STATIC src/facial_landmark_detector.cpp)
 set_target_properties(FacialLandmarksForCubism PROPERTIES
     PUBLIC_HEADER include/facial_landmark_detector.h)
 
-target_include_directories(FacialLandmarksForCubism PRIVATE include lib/dlib)
-target_link_libraries(FacialLandmarksForCubism ${OpenCV_LIBS} dlib::dlib)
+target_include_directories(FacialLandmarksForCubism PRIVATE include)
+target_link_libraries(FacialLandmarksForCubism)
diff --git a/config.txt b/config.txt
index 9c28cee..9830251 100644
--- a/config.txt
+++ b/config.txt
@@ -8,37 +8,17 @@
 # very first character of the line, i.e. without any preceding whitespace.
 
-## Section 1: dlib face detection and webcam parameters
-
-# Path to the dlib shape predictor trained dataset
-predictorPath ./shape_predictor_68_face_landmarks.dat
-
-# Value passed to the cv::VideoCapture() ctor
-cvVideoCaptureId 0
-
-# Number of milliseconds to wait after processing each video frame
-# This value controls the frame rate, but the actual frame period
-# is longer due to the time required to process each frame
-cvWaitKeyMs 5
-
-# If 1, show the webcam captured video on-screen; if 0, don't show
-showWebcamVideo 1
-
-# If 1, draw the detected facial landmarks on-screen; if 0, don't draw
-renderLandmarksOnVideo 1
-
-# If 1, laterally invert the image (create a mirror image); if 0, don't invert
-lateralInversion 1
-
-
-## Section 2: Cubism params calculation control
+## Section 1: Cubism params calculation control
 #
 # These values control how the facial landmarks are translated into
 # parameters that control the Cubism model, and will vary from person
 # to person. The following values seem to work OK for my face, but
 # your mileage may vary.
 
-# Section 2.0: Live2D automatic functionality
+# If 1, laterally invert the image (create a mirror image); if 0, don't invert
+lateralInversion 1
+
+# Section 1.0: Live2D automatic functionality
 # Set 1 to enable, 0 to disable.
 # If these are set, the automatic functionality in Live2D will be enabled.
 # Note: If you set auto blink, eye control will be disabled.
@@ -46,7 +26,7 @@ autoBlink 0
 autoBreath 0
 randomMotion 0
 
-# Section 2.1: Face Y direction angle (head pointing up/down)
+# Section 1.1: Face Y direction angle (head pointing up/down)
 # The Y angle is calculated mainly based on the angle formed
 # by the corners and the tip of the nose (hereafter referred
 # to as the "nose angle").
@@ -78,7 +58,7 @@ faceYAngleXRotCorrection 0.15
 faceYAngleSmileCorrection 0.075
 
 
-# Section 2.2: Eye control
+# Section 1.2: Eye control
 # This is mainly calculated based on the eye aspect ratio (eye height
 # divided by eye width). Note that currently an average of the values
 # of both eyes is applied - mainly due to two reasons: (1) the dlib
@@ -103,7 +83,7 @@ eyeSmileMouthFormThreshold 0.75
 eyeSmileMouthOpenThreshold 0.5
 
 
-# Section 2.3: Mouth control
+# Section 1.3: Mouth control
 # Two parameters are passed to Cubism to control the mouth:
 # - mouth form: Controls smiles / laughs
 # - mouth openness: How widely open the mouth is
@@ -131,7 +111,7 @@ mouthOpenThreshold 0.4
 mouthOpenLaughCorrection 0.2
 
 
-## Section 3: Filtering parameters
+## Section 2: Filtering parameters
 # The facial landmark coordinates can be quite noisy, so I've applied
 # a simple moving average filter to reduce noise. More taps would mean
 # more samples to average over, hence smoother movements with less noise,
diff --git a/include/facial_landmark_detector.h b/include/facial_landmark_detector.h
index 5163fe9..85f3512 100644
--- a/include/facial_landmark_detector.h
+++ b/include/facial_landmark_detector.h
@@ -27,10 +27,18 @@ SOFTWARE.
 
 #include <string>
 #include <deque>
-#include <opencv2/opencv.hpp>
-#include <dlib/image_processing/frontal_face_detector.h>
-#include <dlib/gui_widgets.h>
-#include <dlib/image_processing.h>
+
+struct Point
+{
+    double x;
+    double y;
+
+    Point(double _x = 0, double _y = 0)
+    {
+        x = _x;
+        y = _y;
+    }
+};
 
 class FacialLandmarkDetector
 {
@@ -69,29 +77,25 @@ private:
         RIGHT
     };
 
-    cv::VideoCapture webcam;
-    dlib::image_window win;
-    dlib::frontal_face_detector detector;
-    dlib::shape_predictor predictor;
     bool m_stop;
 
-    double calcEyeAspectRatio(dlib::point& p1, dlib::point& p2,
-                              dlib::point& p3, dlib::point& p4,
-                              dlib::point& p5, dlib::point& p6) const;
+    double calcEyeAspectRatio(Point& p1, Point& p2,
+                              Point& p3, Point& p4,
+                              Point& p5, Point& p6) const;
 
-    double calcRightEyeAspectRatio(dlib::full_object_detection& shape) const;
-    double calcLeftEyeAspectRatio(dlib::full_object_detection& shape) const;
+    double calcRightEyeAspectRatio(Point landmarks[]) const;
+    double calcLeftEyeAspectRatio(Point landmarks[]) const;
 
     double calcEyeOpenness(LeftRight eye,
-                           dlib::full_object_detection& shape,
+                           Point landmarks[],
                            double faceYAngle) const;
 
-    double calcMouthForm(dlib::full_object_detection& shape) const;
-    double calcMouthOpenness(dlib::full_object_detection& shape, double mouthForm) const;
+    double calcMouthForm(Point landmarks[]) const;
+    double calcMouthOpenness(Point landmarks[], double mouthForm) const;
 
-    double calcFaceXAngle(dlib::full_object_detection& shape) const;
-    double calcFaceYAngle(dlib::full_object_detection& shape, double faceXAngle, double mouthForm) const;
-    double calcFaceZAngle(dlib::full_object_detection& shape) const;
+    double calcFaceXAngle(Point landmarks[]) const;
+    double calcFaceYAngle(Point landmarks[], double faceXAngle, double mouthForm) const;
+    double calcFaceZAngle(Point landmarks[]) const;
 
     void populateDefaultConfig(void);
     void parseConfig(std::string cfgPath);
@@ -111,14 +115,10 @@ private:
 
     struct Config
     {
-        int cvVideoCaptureId;
-        std::string predictorPath;
         double faceYAngleCorrection;
         double eyeSmileEyeOpenThreshold;
         double eyeSmileMouthFormThreshold;
        double eyeSmileMouthOpenThreshold;
-        bool showWebcamVideo;
-        bool renderLandmarksOnVideo;
         bool lateralInversion;
         std::size_t faceXAngleNumTaps;
         std::size_t faceYAngleNumTaps;
@@ -127,7 +127,6 @@ private:
         std::size_t mouthOpenNumTaps;
         std::size_t leftEyeOpenNumTaps;
         std::size_t rightEyeOpenNumTaps;
-        int cvWaitKeyMs;
         double eyeClosedThreshold;
         double eyeOpenThreshold;
         double mouthNormalThreshold;
diff --git a/src/facial_landmark_detector.cpp b/src/facial_landmark_detector.cpp
index fb536b6..da24999 100644
--- a/src/facial_landmark_detector.cpp
+++ b/src/facial_landmark_detector.cpp
@@ -26,13 +26,6 @@ SOFTWARE.
 
 #include <fstream>
 #include <sstream>
-#include <opencv2/opencv.hpp>
-
-#include <dlib/opencv.h>
-#include <dlib/image_processing/frontal_face_detector.h>
-#include <dlib/image_processing/render_face_detections.h>
-#include <dlib/image_processing.h>
-
 #include "facial_landmark_detector.h"
 #include "math_utils.h"
 
@@ -52,13 +45,7 @@ FacialLandmarkDetector::FacialLandmarkDetector(std::string cfgPath)
 {
     parseConfig(cfgPath);
 
-    if (!webcam.open(m_cfg.cvVideoCaptureId))
-    {
-        throw std::runtime_error("Unable to open webcam");
-    }
-
-    detector = dlib::get_frontal_face_detector();
-    dlib::deserialize(m_cfg.predictorPath) >> predictor;
+    // TODO: set up UDP connection here?
 }
 
 FacialLandmarkDetector::Params FacialLandmarkDetector::getParams(void) const
@@ -127,92 +114,57 @@ void FacialLandmarkDetector::mainLoop(void)
 {
     while (!m_stop)
     {
-        cv::Mat frame;
-        if (!webcam.read(frame))
-        {
-            throw std::runtime_error("Unable to read from webcam");
-        }
-
-        cv::Mat flipped;
         if (m_cfg.lateralInversion)
         {
-            cv::flip(frame, flipped, 1);
+            // TODO: is this something we can do here? Or in OSF only?
         }
-        else
-        {
-            flipped = frame;
-        }
-        dlib::cv_image<dlib::bgr_pixel> cimg(flipped);
 
-        if (m_cfg.showWebcamVideo)
-        {
-            win.set_image(cimg);
-        }
+        // TODO: get the array of landmark coordinates here
+        Point landmarks[68];
 
-        std::vector<dlib::rectangle> faces = detector(cimg);
-        if (faces.size() > 0)
-        {
-            dlib::rectangle face = faces[0];
-            dlib::full_object_detection shape = predictor(cimg, face);
-
-            /* The coordinates seem to be rather noisy in general.
-             * We will push everything through some moving average filters
-             * to reduce noise. The number of taps is determined empirically
-             * until we get something good.
-             * An alternative method would be to get some better dataset
-             * for dlib - perhaps even to train on a custom data set just for the user.
-             */
-
-            // Face rotation: X direction (left-right)
-            double faceXRot = calcFaceXAngle(shape);
-            filterPush(m_faceXAngle, faceXRot, m_cfg.faceXAngleNumTaps);
-
-            // Mouth form (smile / laugh) detection
-            double mouthForm = calcMouthForm(shape);
-            filterPush(m_mouthForm, mouthForm, m_cfg.mouthFormNumTaps);
-
-            // Face rotation: Y direction (up-down)
-            double faceYRot = calcFaceYAngle(shape, faceXRot, mouthForm);
-            filterPush(m_faceYAngle, faceYRot, m_cfg.faceYAngleNumTaps);
-
-            // Face rotation: Z direction (head tilt)
-            double faceZRot = calcFaceZAngle(shape);
-            filterPush(m_faceZAngle, faceZRot, m_cfg.faceZAngleNumTaps);
-
-            // Mouth openness
-            double mouthOpen = calcMouthOpenness(shape, mouthForm);
-            filterPush(m_mouthOpenness, mouthOpen, m_cfg.mouthOpenNumTaps);
-
-            // Eye openness
-            double eyeLeftOpen = calcEyeOpenness(LEFT, shape, faceYRot);
-            filterPush(m_leftEyeOpenness, eyeLeftOpen, m_cfg.leftEyeOpenNumTaps);
-            double eyeRightOpen = calcEyeOpenness(RIGHT, shape, faceYRot);
-            filterPush(m_rightEyeOpenness, eyeRightOpen, m_cfg.rightEyeOpenNumTaps);
-
-            // TODO eyebrows?
-
-            if (m_cfg.showWebcamVideo && m_cfg.renderLandmarksOnVideo)
-            {
-                win.clear_overlay();
-                win.add_overlay(dlib::render_face_detections(shape));
-            }
-        }
-        else
-        {
-            if (m_cfg.showWebcamVideo && m_cfg.renderLandmarksOnVideo)
-            {
-                win.clear_overlay();
-            }
-        }
+        /* The coordinates seem to be rather noisy in general.
+         * We will push everything through some moving average filters
+         * to reduce noise. The number of taps is determined empirically
+         * until we get something good.
+         * An alternative method would be to get some better dataset -
+         * perhaps even to train on a custom dataset just for the user.
+         */
+
+        // Face rotation: X direction (left-right)
+        double faceXRot = calcFaceXAngle(landmarks);
+        filterPush(m_faceXAngle, faceXRot, m_cfg.faceXAngleNumTaps);
 
-        cv::waitKey(m_cfg.cvWaitKeyMs);
+        // Mouth form (smile / laugh) detection
+        double mouthForm = calcMouthForm(landmarks);
+        filterPush(m_mouthForm, mouthForm, m_cfg.mouthFormNumTaps);
+
+        // Face rotation: Y direction (up-down)
+        double faceYRot = calcFaceYAngle(landmarks, faceXRot, mouthForm);
+        filterPush(m_faceYAngle, faceYRot, m_cfg.faceYAngleNumTaps);
+
+        // Face rotation: Z direction (head tilt)
+        double faceZRot = calcFaceZAngle(landmarks);
+        filterPush(m_faceZAngle, faceZRot, m_cfg.faceZAngleNumTaps);
+
+        // Mouth openness
+        double mouthOpen = calcMouthOpenness(landmarks, mouthForm);
+        filterPush(m_mouthOpenness, mouthOpen, m_cfg.mouthOpenNumTaps);
+
+        // Eye openness
+        double eyeLeftOpen = calcEyeOpenness(LEFT, landmarks, faceYRot);
+        filterPush(m_leftEyeOpenness, eyeLeftOpen, m_cfg.leftEyeOpenNumTaps);
+        double eyeRightOpen = calcEyeOpenness(RIGHT, landmarks, faceYRot);
+        filterPush(m_rightEyeOpenness, eyeRightOpen, m_cfg.rightEyeOpenNumTaps);
+
+        // TODO eyebrows?
     }
 }
 
 double FacialLandmarkDetector::calcEyeAspectRatio(
-    dlib::point& p1, dlib::point& p2,
-    dlib::point& p3, dlib::point& p4,
-    dlib::point& p5, dlib::point& p6) const
+    Point& p1, Point& p2,
+    Point& p3, Point& p4,
+    Point& p5, Point& p6) const
 {
     double eyeWidth = dist(p1, p4);
     double eyeHeight1 = dist(p2, p6);
@@ -223,19 +175,19 @@ double FacialLandmarkDetector::calcEyeAspectRatio(
 
 double FacialLandmarkDetector::calcEyeOpenness(
     LeftRight eye,
-    dlib::full_object_detection& shape,
+    Point landmarks[],
     double faceYAngle) const
 {
     double eyeAspectRatio;
     if (eye == LEFT)
     {
-        eyeAspectRatio = calcEyeAspectRatio(shape.part(42), shape.part(43), shape.part(44),
-                                            shape.part(45), shape.part(46), shape.part(47));
+        eyeAspectRatio = calcEyeAspectRatio(landmarks[42], landmarks[43], landmarks[44],
+                                            landmarks[45], landmarks[46], landmarks[47]);
    }
    else
    {
-        eyeAspectRatio = calcEyeAspectRatio(shape.part(36), shape.part(37), shape.part(38),
-                                            shape.part(39), shape.part(40), shape.part(41));
+        eyeAspectRatio = calcEyeAspectRatio(landmarks[36], landmarks[37], landmarks[38],
+                                            landmarks[39], landmarks[40], landmarks[41]);
    }
 
     // Apply correction due to faceYAngle
@@ -246,7 +198,7 @@ double FacialLandmarkDetector::calcEyeOpenness(
 
 
-double FacialLandmarkDetector::calcMouthForm(dlib::full_object_detection& shape) const
+double FacialLandmarkDetector::calcMouthForm(Point landmarks[]) const
 {
     /* Mouth form parameter: 0 for normal mouth, 1 for fully smiling / laughing.
      * Compare distance between the two corners of the mouth
      * to the distance between the two eyes. An angle-based approach
      * was also considered, but it does not work as well when
      * the angle changes. So here we'll use the distance approach instead.
      */
 
-    auto eye1 = centroid(shape.part(36), shape.part(37), shape.part(38),
-                         shape.part(39), shape.part(40), shape.part(41));
-    auto eye2 = centroid(shape.part(42), shape.part(43), shape.part(44),
-                         shape.part(45), shape.part(46), shape.part(47));
+    auto eye1 = centroid(landmarks[36], landmarks[37], landmarks[38],
+                         landmarks[39], landmarks[40], landmarks[41]);
+    auto eye2 = centroid(landmarks[42], landmarks[43], landmarks[44],
+                         landmarks[45], landmarks[46], landmarks[47]);
     double distEyes = dist(eye1, eye2);
-    double distMouth = dist(shape.part(48), shape.part(54));
+    double distMouth = dist(landmarks[58], landmarks[62]);
 
     double form = linearScale01(distMouth / distEyes,
                                 m_cfg.mouthNormalThreshold,
@@ -275,21 +227,21 @@ double FacialLandmarkDetector::calcMouthForm(dlib::full_object_detection& shape)
     return form;
 }
 
 double FacialLandmarkDetector::calcMouthOpenness(
-    dlib::full_object_detection& shape,
+    Point landmarks[],
     double mouthForm) const
 {
     // Use points for the bottom of the upper lip, and top of the lower lip
     // We have 3 pairs of points available, which give the mouth height
     // on the left, in the middle, and on the right, resp.
     // First let's try to use an average of all three.
-    double heightLeft = dist(shape.part(63), shape.part(65));
-    double heightMiddle = dist(shape.part(62), shape.part(66));
-    double heightRight = dist(shape.part(61), shape.part(67));
+    double heightLeft = dist(landmarks[61], landmarks[63]);
+    double heightMiddle = dist(landmarks[60], landmarks[64]);
+    double heightRight = dist(landmarks[59], landmarks[65]);
 
     double avgHeight = (heightLeft + heightMiddle + heightRight) / 3;
 
     // Now, normalize it with the width of the mouth.
-    double width = dist(shape.part(60), shape.part(64));
+    double width = dist(landmarks[58], landmarks[62]);
 
     double normalized = avgHeight / width;
 
@@ -305,24 +257,24 @@ double FacialLandmarkDetector::calcMouthOpenness(
     return scaled;
 }
 
-double FacialLandmarkDetector::calcFaceXAngle(dlib::full_object_detection& shape) const
+double FacialLandmarkDetector::calcFaceXAngle(Point landmarks[]) const
 {
     // This function will be easier to understand if you refer to the
     // diagram in faceXAngle.png
 
     // Construct the y-axis using (1) average of four points on the nose and
-    // (2) average of four points on the upper lip.
+    // (2) average of five points on the upper lip.
 
-    auto y0 = centroid(shape.part(27), shape.part(28), shape.part(29),
-                       shape.part(30));
-    auto y1 = centroid(shape.part(50), shape.part(51), shape.part(52),
-                       shape.part(62));
+    auto y0 = centroid(landmarks[27], landmarks[28], landmarks[29],
+                       landmarks[30]);
+    auto y1 = centroid(landmarks[48], landmarks[49], landmarks[50],
+                       landmarks[51], landmarks[52]);
 
     // Now drop a perpendicular from the left and right edges of the face,
     // and calculate the ratio between the lengths of these perpendiculars
 
-    auto left = centroid(shape.part(14), shape.part(15), shape.part(16));
-    auto right = centroid(shape.part(0), shape.part(1), shape.part(2));
+    auto left = centroid(landmarks[14], landmarks[15], landmarks[16]);
+    auto right = centroid(landmarks[0], landmarks[1], landmarks[2]);
 
     // Constructing a perpendicular:
     // Join the left/right point and the upper lip. The included angle
@@ -349,13 +301,13 @@ double FacialLandmarkDetector::calcFaceXAngle(dlib::full_object_detection& shape
     return theta;
 }
 
-double FacialLandmarkDetector::calcFaceYAngle(dlib::full_object_detection& shape, double faceXAngle, double mouthForm) const
+double FacialLandmarkDetector::calcFaceYAngle(Point landmarks[], double faceXAngle, double mouthForm) const
 {
     // Use the nose
     // angle between the two left/right points and the tip
-    double c = dist(shape.part(31), shape.part(35));
-    double a = dist(shape.part(30), shape.part(31));
-    double b = dist(shape.part(30), shape.part(35));
+    double c = dist(landmarks[31], landmarks[35]);
+    double a = dist(landmarks[30], landmarks[31]);
+    double b = dist(landmarks[30], landmarks[35]);
 
     double angle = solveCosineRuleAngle(c, a, b);
 
@@ -387,25 +339,25 @@ double FacialLandmarkDetector::calcFaceYAngle(dlib::full_object_detection& shape
     }
 }
 
-double FacialLandmarkDetector::calcFaceZAngle(dlib::full_object_detection& shape) const
+double FacialLandmarkDetector::calcFaceZAngle(Point landmarks[]) const
 {
     // Use average of eyes and nose
 
-    auto eyeRight = centroid(shape.part(36), shape.part(37), shape.part(38),
-                             shape.part(39), shape.part(40), shape.part(41));
-    auto eyeLeft = centroid(shape.part(42), shape.part(43), shape.part(44),
-                            shape.part(45), shape.part(46), shape.part(47));
+    auto eyeRight = centroid(landmarks[36], landmarks[37], landmarks[38],
+                             landmarks[39], landmarks[40], landmarks[41]);
+    auto eyeLeft = centroid(landmarks[42], landmarks[43], landmarks[44],
+                            landmarks[45], landmarks[46], landmarks[47]);
 
-    auto noseLeft = shape.part(35);
-    auto noseRight = shape.part(31);
+    auto noseLeft = landmarks[35];
+    auto noseRight = landmarks[31];
 
-    double eyeYDiff = eyeRight.y() - eyeLeft.y();
-    double eyeXDiff = eyeRight.x() - eyeLeft.x();
+    double eyeYDiff = eyeRight.y - eyeLeft.y;
+    double eyeXDiff = eyeRight.x - eyeLeft.x;
 
     double angle1 = std::atan(eyeYDiff / eyeXDiff);
 
-    double noseYDiff = noseRight.y() - noseLeft.y();
-    double noseXDiff = noseRight.x() - noseLeft.x();
+    double noseYDiff = noseRight.y - noseLeft.y;
+    double noseXDiff = noseRight.x - noseLeft.x;
 
     double angle2 = std::atan(noseYDiff / noseXDiff);
 
@@ -440,23 +392,7 @@ void FacialLandmarkDetector::parseConfig(std::string cfgPath)
         std::string paramName;
         if (ss >> paramName)
         {
-            if (paramName == "cvVideoCaptureId")
-            {
-                if (!(ss >> m_cfg.cvVideoCaptureId))
-                {
-                    throwConfigError(paramName, "int",
-                                     line, lineNum);
-                }
-            }
-            else if (paramName == "predictorPath")
-            {
-                if (!(ss >> m_cfg.predictorPath))
-                {
-                    throwConfigError(paramName, "std::string",
-                                     line, lineNum);
-                }
-            }
-            else if (paramName == "faceYAngleCorrection")
+            if (paramName == "faceYAngleCorrection")
             {
                 if (!(ss >> m_cfg.faceYAngleCorrection))
                 {
@@ -488,22 +424,6 @@ void FacialLandmarkDetector::parseConfig(std::string cfgPath)
                                      line, lineNum);
                 }
             }
-            else if (paramName == "showWebcamVideo")
-            {
-                if (!(ss >> m_cfg.showWebcamVideo))
-                {
-                    throwConfigError(paramName, "bool",
-                                     line, lineNum);
-                }
-            }
-            else if (paramName == "renderLandmarksOnVideo")
-            {
-                if (!(ss >> m_cfg.renderLandmarksOnVideo))
-                {
-                    throwConfigError(paramName, "bool",
-                                     line, lineNum);
-                }
-            }
             else if (paramName == "lateralInversion")
             {
                 if (!(ss >> m_cfg.lateralInversion))
@@ -568,14 +488,6 @@ void FacialLandmarkDetector::parseConfig(std::string cfgPath)
                                      line, lineNum);
                 }
             }
-            else if (paramName == "cvWaitKeyMs")
-            {
-                if (!(ss >> m_cfg.cvWaitKeyMs))
-                {
-                    throwConfigError(paramName, "int",
-                                     line, lineNum);
-                }
-            }
             else if (paramName == "eyeClosedThreshold")
"eyeClosedThreshold") { if (!(ss >> m_cfg.eyeClosedThreshold)) @@ -713,16 +625,11 @@ void FacialLandmarkDetector::populateDefaultConfig(void) // These are values that I've personally tested to work OK for my face. // Your milage may vary - hence the config file. - m_cfg.cvVideoCaptureId = 0; - m_cfg.predictorPath = "shape_predictor_68_face_landmarks.dat"; m_cfg.faceYAngleCorrection = 10; m_cfg.eyeSmileEyeOpenThreshold = 0.6; m_cfg.eyeSmileMouthFormThreshold = 0.75; m_cfg.eyeSmileMouthOpenThreshold = 0.5; - m_cfg.showWebcamVideo = true; - m_cfg.renderLandmarksOnVideo = true; m_cfg.lateralInversion = true; - m_cfg.cvWaitKeyMs = 5; m_cfg.faceXAngleNumTaps = 11; m_cfg.faceYAngleNumTaps = 11; m_cfg.faceZAngleNumTaps = 11; diff --git a/src/math_utils.h b/src/math_utils.h index d465c87..f6b3d19 100644 --- a/src/math_utils.h +++ b/src/math_utils.h @@ -27,7 +27,6 @@ SOFTWARE. #include #include -#include static const double PI = 3.14159265358979; @@ -47,20 +46,21 @@ static double avg(T container, double defaultValue = 0) return sum / container.size(); } + template -static dlib::point centroid(Args&... args) +static Point centroid(Args&... args) { std::size_t numArgs = sizeof...(args); - if (numArgs == 0) return dlib::point(0, 0); + if (numArgs == 0) return Point(0, 0); double sumX = 0, sumY = 0; for (auto point : {args...}) { - sumX += point.x(); - sumY += point.y(); + sumX += point.x; + sumY += point.y; } - return dlib::point(sumX / numArgs, sumY / numArgs); + return Point(sumX / numArgs, sumY / numArgs); } static inline double sq(double x) @@ -88,10 +88,10 @@ static inline double degToRad(double deg) return deg * PI / 180; } -double dist(dlib::point& p1, dlib::point& p2) +double dist(Point& p1, Point& p2) { - double xDist = p1.x() - p2.x(); - double yDist = p1.y() - p2.y(); + double xDist = p1.x - p2.x; + double yDist = p1.y - p2.y; return std::hypot(xDist, yDist); } -- 2.7.4