
Commit eafee53

detect speaker from video
1 parent acd23ba commit eafee53

1 file changed: 135 additions, 0 deletions
@@ -0,0 +1,135 @@
# USAGE
# python speaking_detection.py --shape-predictor shape_predictor_68_face_landmarks.dat
# python speaking_detection.py --shape-predictor shape_predictor_68_face_landmarks.dat --picamera 1

# import the necessary packages
from imutils.video import VideoStream
from imutils import face_utils
import datetime
import argparse
import imutils
import time
import dlib
import cv2
import numpy as np


def is_speaking(prev_img, curr_img, debug=False, threshold=500, width=400, height=400):
    """
    Args:
        prev_img: grayscale mouth crop from the previous frame
        curr_img: grayscale mouth crop from the current frame
    Returns:
        Bool value indicating whether the person appears to be speaking
    """
    prev_img = cv2.resize(prev_img, (width, height))
    curr_img = cv2.resize(curr_img, (width, height))

    diff = cv2.absdiff(prev_img, curr_img)
    norm = np.sum(diff) / (width * height) * 100
    if debug:
        print(norm)
    return norm > threshold
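

# Note: `norm` above is the mean absolute pixel difference between the two
# resized mouth crops, scaled by 100, so larger values indicate more mouth
# motion between consecutive frames; the default threshold of 500 appears to
# be an empirical choice and may need adjusting per video.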

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-p", "--shape-predictor", required=True,
    help="path to facial landmark predictor")
ap.add_argument("-r", "--picamera", type=int, default=-1,
    help="whether or not the Raspberry Pi camera should be used")
ap.add_argument("-t", "--threshold", type=int, default=500,
    help="motion threshold above which the mouth is considered to be speaking")
ap.add_argument("-d", "--debug", action='store_true')
ap.add_argument("-w", "--width", type=int, default=800,
    help="width to resize each frame to")
args = vars(ap.parse_args())

# initialize dlib's face detector (HOG-based) and then create
# the facial landmark predictor
print("[INFO] loading facial landmark predictor...")
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(args["shape_predictor"])

# grab the indices of the facial landmarks for the mouth
m_start, m_end = face_utils.FACIAL_LANDMARKS_IDXS['mouth']

# initialize the video stream and allow the camera sensor to warm up
#print("[INFO] camera sensor warming up...")

#vs = VideoStream(usePiCamera=args["picamera"] > 0).start()
#time.sleep(2.0)
video_file = "/Users/aryaman/research/FER_datasets/video/videos/ErinBrockavich_shot_2.mp4"
cap = cv2.VideoCapture(video_file)

prev_mouth_img = None
i = 0           # number of frames flagged as speaking so far
margin = 10     # extra pixels around the mouth landmarks when cropping
frame_count = 0

# loop over the frames from the video file
while cap.isOpened():
    # grab the next frame; stop when the video ends
    ret, frame = cap.read()
    if not ret:
        break
    frame_count = frame_count + 1

    # resize the frame and convert it to grayscale
    frame = imutils.resize(frame, width=args["width"])
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # detect faces in the grayscale frame
    rects = detector(gray, 0)

    # loop over the face detections
    for rect in rects:
        # determine the facial landmarks for the face region, then
        # convert the facial landmark (x, y)-coordinates to a NumPy
        # array
        shape = predictor(gray, rect)
        shape = face_utils.shape_to_np(shape)

        mouth_shape = shape[m_start:m_end]

        # bounding box of the mouth landmarks, padded by `margin` pixels
        # (y grows downward, so min(y) is the top edge)
        leftmost_x = min(x for x, y in mouth_shape) - margin
        top_y = min(y for x, y in mouth_shape) - margin
        rightmost_x = max(x for x, y in mouth_shape) + margin
        bottom_y = max(y for x, y in mouth_shape) + margin

        w = rightmost_x - leftmost_x
        h = bottom_y - top_y

        # expand the drawn box by 10% on each side
        x = int(leftmost_x - 0.1 * w)
        y = int(top_y - 0.1 * h)

        w = int(1.2 * w)
        h = int(1.2 * h)

        mouth_img = gray[top_y:bottom_y, leftmost_x:rightmost_x]

        # loop over the (x, y)-coordinates for the facial landmarks
        # and draw them on the image
        # for (x, y) in mouth_shape:
        #     cv2.circle(frame, (x, y), 1, (0, 0, 255), -1)
        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

        # confer this
        # https://github.com/seanexplode/LipReader/blob/master/TrackFaces.c#L68
        if prev_mouth_img is None:
            prev_mouth_img = mouth_img
        if is_speaking(prev_mouth_img, mouth_img, threshold=args['threshold'],
                       debug=args['debug']):
            print(str(i), "speaking, frame count: ", frame_count)
            i += 1

        prev_mouth_img = mouth_img
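        # Note: prev_mouth_img is shared across every detected face, so when
        # more than one face is in the frame the difference above can compare
        # mouth crops from different people; per-face tracking would be needed
        # to handle a multi-speaker scene reliably.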

    # show the frame
    #cv2.imshow("Frame", frame)
    #key = cv2.waitKey(1) & 0xFF

    # if the `q` key was pressed, break from the loop
    #if key == ord("q"):
    #    break

# do a bit of cleanup
cap.release()
cv2.destroyAllWindows()
