17 #include "objdetectdata.pb.h"
20 #define uint64 uint64_t
21 #include <opencv2/core/ocl.hpp>
35 #include <google/protobuf/util/time_util.h>
38 using google::protobuf::util::TimeUtil;
42 std::string LoadONNXModel(
const std::string& modelPath, cv::dnn::Net* net)
45 cv::dnn::Net loadedNet = cv::dnn::readNetFromONNX(modelPath);
49 }
catch (
const cv::Exception& e) {
50 return std::string(
"Failed to load ONNX model: ") + e.what();
51 }
catch (
const std::exception& e) {
52 return std::string(
"Failed to load ONNX model: ") + e.what();
56 std::vector<uint32_t> EncodeBinaryMaskRLE(
const cv::Mat& mask)
58 std::vector<uint32_t> rle;
64 for (
int y = 0; y < mask.rows; ++y) {
65 const uint8_t* row = mask.ptr<uint8_t>(y);
66 for (
int x = 0; x < mask.cols; ++x) {
67 const uint8_t value = row[x] ? 1 : 0;
68 if (value == current) {
81 struct EfficientSamPreprocessResult {
87 EfficientSamPreprocessResult MakeEfficientSamBlob(
const cv::Mat& bgr,
int modelSize)
89 EfficientSamPreprocessResult result;
90 result.scaleX =
static_cast<float>(modelSize) /
static_cast<float>(bgr.cols);
91 result.scaleY =
static_cast<float>(modelSize) /
static_cast<float>(bgr.rows);
94 cv::resize(bgr, resized, cv::Size(modelSize, modelSize), 0, 0, cv::INTER_LINEAR);
96 const int shape[] = {1, 3, modelSize, modelSize};
97 result.blob = cv::Mat(4, shape, CV_32F);
98 float* dst = result.blob.ptr<
float>();
100 for (
int y = 0; y < resized.rows; ++y) {
101 const cv::Vec3b* row = resized.ptr<cv::Vec3b>(y);
102 for (
int x = 0; x < resized.cols; ++x) {
103 const float rgb[] = {
104 static_cast<float>(row[x][2]) / 255.0f,
105 static_cast<float>(row[x][1]) / 255.0f,
106 static_cast<float>(row[x][0]) / 255.0f,
108 for (
int c = 0; c < 3; ++c)
109 dst[(c * modelSize + y) * modelSize + x] = rgb[c];
116 cv::Rect_<float> NormalizedBoundingBox(
const cv::Mat& mask)
118 std::vector<cv::Point> points;
119 cv::findNonZero(mask, points);
123 cv::Rect rect = cv::boundingRect(points);
124 return cv::Rect_<float>(
125 rect.x /
static_cast<float>(mask.cols),
126 rect.y /
static_cast<float>(mask.rows),
127 rect.width /
static_cast<float>(mask.cols),
128 rect.height /
static_cast<float>(mask.rows));
131 cv::Mat EfficientSamMaskToFrameMask(
const cv::Mat& modelMask,
const cv::Size& frameSize,
float maskThreshold)
134 cv::resize(modelMask, fullSize, frameSize, 0, 0, cv::INTER_LINEAR);
137 cv::threshold(fullSize, binary, maskThreshold, 255.0, cv::THRESH_BINARY);
138 if (cv::countNonZero(binary) == 0) {
139 double maxValue = 0.0;
140 cv::minMaxLoc(fullSize,
nullptr, &maxValue);
141 if (maxValue > 0.0) {
142 cv::threshold(fullSize, binary, maxValue * 0.5, 255.0, cv::THRESH_BINARY);
145 binary.convertTo(binary, CV_8U);
149 cv::Mat MakeEfficientSamPromptBlob(
151 const EfficientSamPreprocessResult& prep,
153 std::vector<cv::Point>& backgroundPoints,
154 std::vector<cv::Rect>& backgroundRects)
156 const int coordsShape[] = {1, 1, promptSlots, 2};
157 cv::Mat pointCoords(4, coordsShape, CV_32F, cv::Scalar(0.0f));
159 float* coords = pointCoords.ptr<
float>();
162 if (promptIndex + 1 >= promptSlots)
164 coords[promptIndex * 2] = rect.x * prep.scaleX;
165 coords[promptIndex * 2 + 1] = rect.y * prep.scaleY;
167 coords[promptIndex * 2] = (rect.x + rect.width) * prep.scaleX;
168 coords[promptIndex * 2 + 1] = (rect.y + rect.height) * prep.scaleY;
172 if (promptIndex >= promptSlots)
174 coords[promptIndex * 2] = point.x * prep.scaleX;
175 coords[promptIndex * 2 + 1] = point.y * prep.scaleY;
179 backgroundPoints.emplace_back(
180 static_cast<int>(std::lround(point.x * prep.scaleX)),
181 static_cast<int>(std::lround(point.y * prep.scaleY)));
184 const int x1 =
static_cast<int>(std::floor(rect.x * prep.scaleX));
185 const int y1 =
static_cast<int>(std::floor(rect.y * prep.scaleY));
186 const int x2 =
static_cast<int>(std::ceil((rect.x + rect.width) * prep.scaleX));
187 const int y2 =
static_cast<int>(std::ceil((rect.y + rect.height) * prep.scaleY));
188 const int modelWidth = prep.blob.size[3];
189 const int modelHeight = prep.blob.size[2];
190 const int left = std::max(0, std::min(modelWidth - 1, x1));
191 const int top = std::max(0, std::min(modelHeight - 1, y1));
192 const int right = std::max(left + 1, std::min(modelWidth, x2));
193 const int bottom = std::max(top + 1, std::min(modelHeight, y2));
194 backgroundRects.emplace_back(left, top, right - left, bottom - top);
202 const int labelsShape[] = {1, 1, promptSlots, 1};
203 cv::Mat pointLabels(4, labelsShape, CV_32F, cv::Scalar(-1.0f));
205 float* labels = pointLabels.ptr<
float>();
207 for (
size_t i = 0; i < prompts.
positiveRects.size() && promptIndex + 1 < promptSlots; ++i) {
208 labels[promptIndex++] = 2.0f;
209 labels[promptIndex++] = 3.0f;
211 for (
size_t i = 0; i < prompts.
positivePoints.size() && promptIndex < promptSlots; ++i, ++promptIndex)
212 labels[promptIndex] = 1.0f;
217 cv::Mat SelectEfficientSamMask(
const cv::Mat& outputMasks,
const cv::Mat& iouPredictions,
218 const std::vector<cv::Point>& backgroundPoints,
219 const std::vector<cv::Rect>& backgroundRects,
222 if (outputMasks.dims != 5 || iouPredictions.empty())
225 const int candidateCount = outputMasks.size[2];
226 const int maskHeight = outputMasks.size[3];
227 const int maskWidth = outputMasks.size[4];
228 const float* ious = iouPredictions.ptr<
float>();
230 const float* masks = outputMasks.ptr<
float>();
231 const size_t candidatePixels =
static_cast<size_t>(maskHeight) *
static_cast<size_t>(maskWidth);
233 float bestScore = -std::numeric_limits<float>::infinity();
234 for (
int candidate = 0; candidate < candidateCount; ++candidate) {
235 cv::Mat mask(maskHeight, maskWidth, CV_32F,
236 const_cast<float*
>(masks +
static_cast<size_t>(candidate) * candidatePixels));
238 int backgroundHits = 0;
239 for (
const cv::Point& point : backgroundPoints) {
240 const int x = std::max(0, std::min(maskWidth - 1, point.x));
241 const int y = std::max(0, std::min(maskHeight - 1, point.y));
242 if (mask.at<
float>(y, x) >= maskThreshold)
246 float rectOverlapPenalty = 0.0f;
247 for (
const cv::Rect& rect : backgroundRects) {
248 const cv::Rect clipped = rect & cv::Rect(0, 0, maskWidth, maskHeight);
249 const int area = clipped.area();
253 for (
int y = clipped.y; y < clipped.y + clipped.height; ++y) {
254 const float* row = mask.ptr<
float>(y);
255 for (
int x = clipped.x; x < clipped.x + clipped.width; ++x) {
256 if (row[x] >= maskThreshold)
260 rectOverlapPenalty +=
static_cast<float>(overlap) /
static_cast<float>(area);
263 const float pointPenalty = backgroundPoints.empty()
265 :
static_cast<float>(backgroundHits) /
static_cast<float>(backgroundPoints.size());
266 if (!backgroundRects.empty())
267 rectOverlapPenalty /=
static_cast<float>(backgroundRects.size());
269 const float score = ious[candidate] - (0.35f * pointPenalty) - (0.75f * rectOverlapPenalty);
270 if (bestMask.empty() || score > bestScore) {
272 bestMask = mask.clone();
286 frameData.
score = score;
287 frameData.
width = mask.cols;
288 frameData.
height = mask.rows;
289 frameData.
rle = EncodeBinaryMaskRLE(mask);
290 frameData.
box = NormalizedBoundingBox(mask);
294 cv::Point2f JsonPoint(
const Json::Value& value)
296 if (!value.isObject() || value[
"x"].isNull() || value[
"y"].isNull())
297 return cv::Point2f(-1.0f, -1.0f);
298 return cv::Point2f(value[
"x"].asFloat(), value[
"y"].asFloat());
301 bool IsValidPoint(
const cv::Point2f& point)
303 return point.x >= 0.0f && point.y >= 0.0f;
306 void AppendJsonPoints(
const Json::Value& values, std::vector<cv::Point2f>& points)
308 if (!values.isArray())
310 for (
const auto& value : values) {
311 cv::Point2f point = JsonPoint(value);
312 if (IsValidPoint(point))
313 points.push_back(point);
317 size_t JsonFrameNumber(
const std::string& frameName)
320 return static_cast<size_t>(std::max(0, std::stoi(frameName)));
326 bool RectFromJson(
const Json::Value& rect, cv::Rect_<float>& output)
328 if (!rect.isObject() || rect[
"x1"].isNull() || rect[
"y1"].isNull() ||
329 rect[
"x2"].isNull() || rect[
"y2"].isNull()) {
333 const float x1 = std::min(rect[
"x1"].asFloat(), rect[
"x2"].asFloat());
334 const float y1 = std::min(rect[
"y1"].asFloat(), rect[
"y2"].asFloat());
335 const float x2 = std::max(rect[
"x1"].asFloat(), rect[
"x2"].asFloat());
336 const float y2 = std::max(rect[
"y1"].asFloat(), rect[
"y2"].asFloat());
337 cv::Point2f topLeft(x1, y1);
338 cv::Point2f bottomRight(x2, y2);
339 if (!IsValidPoint(topLeft) || !IsValidPoint(bottomRight) || x2 <= x1 || y2 <= y1)
342 output = cv::Rect_<float>(x1, y1, x2 - x1, y2 - y1);
346 void AppendJsonRects(
const Json::Value& values, std::vector<cv::Rect_<float>>& rects)
348 if (!values.isArray())
350 for (
const auto& rect : values) {
351 cv::Rect_<float> parsed;
352 if (RectFromJson(rect, parsed))
353 rects.push_back(parsed);
360 AppendJsonPoints(framePayload[
"positive_points"], prompts.
positivePoints);
361 AppendJsonPoints(framePayload[
"negative_points"], prompts.
negativePoints);
362 AppendJsonRects(framePayload[
"positive_rects"], prompts.
positiveRects);
363 AppendJsonRects(framePayload[
"negative_rects"], prompts.
negativeRects);
367 cv::Mat MakeBlob(
const std::vector<int>& shape,
float value = 0.0f)
369 cv::Mat output(
static_cast<int>(shape.size()), shape.data(), CV_32F);
374 std::string SetNetDevice(cv::dnn::Net& net,
const std::string& processingDevice)
376 if (processingDevice ==
"CPU") {
377 net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
378 net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
382 if (processingDevice ==
"GPU" || processingDevice ==
"GPU_AUTO" || processingDevice ==
"GPU_CUDA") {
384 const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_CUDA);
385 if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_CUDA) != targets.end()) {
386 net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
387 net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
390 }
catch (
const cv::Exception&) {
394 if (processingDevice ==
"GPU_OPENCL") {
396 const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_OPENCV);
397 if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_OPENCL) != targets.end()) {
398 cv::ocl::setUseOpenCL(
true);
399 net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
400 net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
403 }
catch (
const cv::Exception&) {
407 net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
408 net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
412 class CutiePropagator {
414 static constexpr
int memorySlots = 6;
415 int modelWidth = 640;
416 int modelHeight = 368;
417 int stride16Width = modelWidth / 16;
418 int stride16Height = modelHeight / 16;
427 struct LetterboxTransform {
428 cv::Size originalSize;
429 cv::Rect contentRect;
432 cv::dnn::Net encodeKey;
433 cv::dnn::Net encodeValue;
434 cv::dnn::Net memoryReadout;
438 cv::Mat objectMemory;
439 MemoryFrame permanentMemory;
440 bool hasPermanentMemory =
false;
441 std::deque<MemoryFrame> workingMemoryFrames;
443 int lastMemoryFrame = -1000000;
445 int maxMemoryFrames = memorySlots;
// Extracts a "<width>x<height>" pair (e.g. "640x368") from a model path.
//
// Every 'x' in modelPath is tried from left to right; the digit runs directly
// before and after it are the candidate width and height. The first candidate
// whose values are both positive multiples of 16 wins.
//
// Digit runs too long to fit in an int are treated as "no match" rather than
// raising std::out_of_range the way std::stoi does.
//
// @param modelPath  Path or file name to scan.
// @param width      Receives the parsed width; written only on success.
// @param height     Receives the parsed height; written only on success.
// @return true when a usable size was found, false otherwise.
static bool ParseModelSize(const std::string& modelPath, int& width, int& height)
{
    const auto isDigitAt = [&modelPath](size_t index) {
        return std::isdigit(static_cast<unsigned char>(modelPath[index])) != 0;
    };

    // Converts the digit run [first, last) to an int; fails on overflow.
    const auto digitsToInt = [&modelPath](size_t first, size_t last, int& value) {
        long long total = 0;
        for (size_t i = first; i < last; ++i) {
            total = total * 10 + (modelPath[i] - '0');
            if (total > std::numeric_limits<int>::max())
                return false;
        }
        value = static_cast<int>(total);
        return true;
    };

    for (size_t xPos = modelPath.find('x'); xPos != std::string::npos;
         xPos = modelPath.find('x', xPos + 1)) {
        size_t widthStart = xPos;
        while (widthStart > 0 && isDigitAt(widthStart - 1))
            --widthStart;

        size_t heightEnd = xPos + 1;
        while (heightEnd < modelPath.size() && isDigitAt(heightEnd))
            ++heightEnd;

        // This 'x' needs digits on both sides to be a size separator.
        if (widthStart == xPos || heightEnd == xPos + 1)
            continue;

        int candidateWidth = 0;
        int candidateHeight = 0;
        if (!digitsToInt(widthStart, xPos, candidateWidth) ||
            !digitsToInt(xPos + 1, heightEnd, candidateHeight))
            continue;

        if (candidateWidth > 0 && candidateHeight > 0 &&
            candidateWidth % 16 == 0 && candidateHeight % 16 == 0) {
            width = candidateWidth;
            height = candidateHeight;
            return true;
        }
    }
    return false;
}
470 void ConfigureModelSize(
const std::string& modelPath)
472 int width = modelWidth;
473 int height = modelHeight;
474 if (!ParseModelSize(modelPath, width, height))
477 modelHeight = height;
478 stride16Width = modelWidth / 16;
479 stride16Height = modelHeight / 16;
482 LetterboxTransform ComputeLetterbox(
const cv::Size& sourceSize)
const
484 LetterboxTransform transform;
485 transform.originalSize = sourceSize;
486 if (sourceSize.width <= 0 || sourceSize.height <= 0) {
487 transform.contentRect = cv::Rect(0, 0, modelWidth, modelHeight);
491 const float scaleX =
static_cast<float>(modelWidth) /
static_cast<float>(sourceSize.width);
492 const float scaleY =
static_cast<float>(modelHeight) /
static_cast<float>(sourceSize.height);
493 const float scale = std::min(scaleX, scaleY);
495 const int resizedWidth = std::max(1, std::min(
496 modelWidth,
static_cast<int>(std::lround(sourceSize.width * scale))));
497 const int resizedHeight = std::max(1, std::min(
498 modelHeight,
static_cast<int>(std::lround(sourceSize.height * scale))));
499 const int offsetX = (modelWidth - resizedWidth) / 2;
500 const int offsetY = (modelHeight - resizedHeight) / 2;
501 transform.contentRect = cv::Rect(offsetX, offsetY, resizedWidth, resizedHeight);
505 cv::Mat MakeImageBlob(
const cv::Mat& bgr,
const LetterboxTransform& transform)
const
508 cv::resize(bgr, resized, transform.contentRect.size(), 0, 0, cv::INTER_LINEAR);
509 cv::Mat canvas(modelHeight, modelWidth, bgr.type(), cv::Scalar::all(0));
510 resized.copyTo(canvas(transform.contentRect));
512 const int shape[] = {1, 3, modelHeight, modelWidth};
513 cv::Mat blob(4, shape, CV_32F);
514 float* dst = blob.ptr<
float>();
515 for (
int y = 0; y < canvas.rows; ++y) {
516 const cv::Vec3b* row = canvas.ptr<cv::Vec3b>(y);
517 for (
int x = 0; x < canvas.cols; ++x) {
518 dst[(0 * modelHeight + y) * modelWidth + x] =
static_cast<float>(row[x][2]) / 255.0f;
519 dst[(1 * modelHeight + y) * modelWidth + x] =
static_cast<float>(row[x][1]) / 255.0f;
520 dst[(2 * modelHeight + y) * modelWidth + x] =
static_cast<float>(row[x][0]) / 255.0f;
526 cv::Mat MakeMaskBlob(
const cv::Mat& mask,
const LetterboxTransform& transform)
const
529 cv::resize(mask, resized, transform.contentRect.size(), 0, 0, cv::INTER_NEAREST);
530 cv::Mat canvas(modelHeight, modelWidth, CV_8U, cv::Scalar(0));
531 resized.copyTo(canvas(transform.contentRect));
533 const int shape[] = {1, 1, modelHeight, modelWidth};
534 cv::Mat blob(4, shape, CV_32F, cv::Scalar(0.0f));
535 float* dst = blob.ptr<
float>();
536 for (
int y = 0; y < canvas.rows; ++y) {
537 const uint8_t* row = canvas.ptr<uint8_t>(y);
538 for (
int x = 0; x < canvas.cols; ++x)
539 dst[y * modelWidth + x] = row[x] ? 1.0f : 0.0f;
544 cv::Mat ForegroundFromProb(
const cv::Mat& prob)
const
546 const int shape[] = {1, 1, modelHeight, modelWidth};
547 cv::Mat foreground(4, shape, CV_32F);
548 const float* src = prob.ptr<
float>();
549 float* dst = foreground.ptr<
float>();
550 const int plane = modelWidth * modelHeight;
551 std::memcpy(dst, src + plane,
sizeof(
float) * plane);
555 cv::Mat BinaryMaskFromForeground(
const cv::Mat& foreground,
const LetterboxTransform& transform)
const
557 cv::Mat modelMask(modelHeight, modelWidth, CV_8U, cv::Scalar(0));
558 const float* src = foreground.ptr<
float>();
559 for (
int y = 0; y < modelMask.rows; ++y) {
560 uint8_t* row = modelMask.ptr<uint8_t>(y);
561 for (
int x = 0; x < modelMask.cols; ++x)
562 row[x] = src[y * modelWidth + x] >= 0.5f ? 255 : 0;
565 cv::Mat cropped = modelMask(transform.contentRect);
567 cv::resize(cropped, restored, transform.originalSize, 0, 0, cv::INTER_NEAREST);
571 cv::Mat ValidMaskFromLetterbox(
const LetterboxTransform& transform)
const
573 cv::Mat valid(stride16Height, stride16Width, CV_32F, cv::Scalar(0.0f));
574 for (
int y = 0; y < stride16Height; ++y) {
575 float* row = valid.ptr<
float>(y);
576 const int centerY = y * 16 + 8;
577 for (
int x = 0; x < stride16Width; ++x) {
578 const int centerX = x * 16 + 8;
579 if (transform.contentRect.contains(cv::Point(centerX, centerY)))
584 const int shape[] = {1, 1, stride16Height, stride16Width};
585 cv::Mat blob(4, shape, CV_32F);
586 std::memcpy(blob.ptr<
float>(), valid.ptr<
float>(),
sizeof(
float) * valid.total());
590 void CopyKeySlot(
const cv::Mat& src, cv::Mat& dst,
int slot,
int channels)
const
592 const float* in = src.ptr<
float>();
593 float* out = dst.ptr<
float>();
594 const int plane = stride16Width * stride16Height;
595 for (
int c = 0; c < channels; ++c) {
596 std::memcpy(out + (c * memorySlots + slot) * plane,
598 sizeof(
float) * plane);
602 void CopyValueSlot(
const cv::Mat& src, cv::Mat& dst,
int slot)
const
604 const float* in = src.ptr<
float>();
605 float* out = dst.ptr<
float>();
606 const int plane = stride16Width * stride16Height;
607 for (
int c = 0; c < 256; ++c) {
608 std::memcpy(out + (c * memorySlots + slot) * plane,
610 sizeof(
float) * plane);
614 cv::Mat MemoryKeyBlob()
const
616 cv::Mat output = MakeBlob({1, 64, memorySlots, stride16Height, stride16Width});
618 if (hasPermanentMemory)
619 CopyKeySlot(permanentMemory.key, output, slot++, 64);
621 index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
623 CopyKeySlot(workingMemoryFrames[index].key, output, slot, 64);
627 cv::Mat MemoryShrinkageBlob()
const
629 cv::Mat output = MakeBlob({1, 1, memorySlots, stride16Height, stride16Width});
631 if (hasPermanentMemory)
632 CopyKeySlot(permanentMemory.shrinkage, output, slot++, 1);
634 index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
636 CopyKeySlot(workingMemoryFrames[index].shrinkage, output, slot, 1);
640 cv::Mat MemoryValueBlob()
const
642 cv::Mat output = MakeBlob({1, 1, 256, memorySlots, stride16Height, stride16Width});
644 if (hasPermanentMemory)
645 CopyValueSlot(permanentMemory.value, output, slot++);
647 index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
649 CopyValueSlot(workingMemoryFrames[index].value, output, slot);
653 cv::Mat MemoryValidBlob()
const
655 cv::Mat output = MakeBlob({1, 1, memorySlots, stride16Height, stride16Width});
656 float* data = output.ptr<
float>();
657 const int plane = stride16Width * stride16Height;
658 auto copyValidSlot = [&](
const cv::Mat& valid,
int slot) {
659 std::memcpy(data + slot * plane, valid.ptr<
float>(),
sizeof(
float) * plane);
663 if (hasPermanentMemory)
664 copyValidSlot(permanentMemory.valid, slot++);
666 index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
668 copyValidSlot(workingMemoryFrames[index].valid, slot);
672 void AddMemory(
const cv::Mat& key,
const cv::Mat& shrinkage,
const cv::Mat& value,
673 const cv::Mat& valid,
bool asPermanent)
676 frame.key = key.clone();
677 frame.shrinkage = shrinkage.clone();
678 frame.value = value.clone();
679 frame.valid = valid.clone();
681 if (asPermanent || !hasPermanentMemory) {
682 permanentMemory = frame;
683 hasPermanentMemory =
true;
687 workingMemoryFrames.push_back(frame);
688 const int workingCapacity = std::max(0, maxMemoryFrames - 1);
689 while (
static_cast<int>(workingMemoryFrames.size()) > workingCapacity)
690 workingMemoryFrames.pop_front();
693 void AddObjectMemory(
const cv::Mat& value)
695 if (objectMemory.empty()) {
696 objectMemory = MakeBlob({1, 1, 1, 16, 257});
697 std::memcpy(objectMemory.ptr<
float>(), value.ptr<
float>(),
sizeof(
float) * value.total());
701 float* dst = objectMemory.ptr<
float>();
702 const float* src = value.ptr<
float>();
703 for (
size_t i = 0; i < value.total(); ++i)
708 void Load(
const std::string& encodeKeyPath,
const std::string& encodeValuePath,
709 const std::string& memoryReadoutPath,
const std::string& decodePath)
711 ConfigureModelSize(encodeKeyPath);
712 encodeKey = cv::dnn::readNetFromONNX(encodeKeyPath);
713 encodeValue = cv::dnn::readNetFromONNX(encodeValuePath);
714 memoryReadout = cv::dnn::readNetFromONNX(memoryReadoutPath);
715 decode = cv::dnn::readNetFromONNX(decodePath);
716 sensory = MakeBlob({1, 1, 256, stride16Height, stride16Width});
719 std::string SetDevice(
const std::string& processingDevice)
721 std::string selected = SetNetDevice(encodeKey, processingDevice);
722 const std::string valueDevice = SetNetDevice(encodeValue, processingDevice);
723 const std::string readoutDevice = SetNetDevice(memoryReadout, processingDevice);
724 const std::string decodeDevice = SetNetDevice(decode, processingDevice);
725 if (selected != valueDevice || selected != readoutDevice || selected != decodeDevice)
732 sensory = MakeBlob({1, 1, 256, stride16Height, stride16Width});
734 objectMemory.release();
735 permanentMemory = MemoryFrame();
736 hasPermanentMemory =
false;
737 workingMemoryFrames.clear();
739 lastMemoryFrame = -1000000;
742 bool HasMemory()
const
744 return hasPermanentMemory || !workingMemoryFrames.empty();
747 cv::Mat Step(
const cv::Mat& frame,
const cv::Mat& seedMask = cv::Mat())
749 const LetterboxTransform transform = ComputeLetterbox(frame.size());
750 const cv::Mat validMask = ValidMaskFromLetterbox(transform);
751 cv::Mat image = MakeImageBlob(frame, transform);
753 encodeKey.setInput(image,
"image");
754 std::vector<cv::Mat> keyOutputs;
755 encodeKey.forward(keyOutputs, std::vector<cv::String>{
"f16",
"f8",
"f4",
"pix_feat",
"key",
"shrinkage",
"selection"});
756 cv::Mat f8 = keyOutputs[1];
757 cv::Mat f4 = keyOutputs[2];
758 cv::Mat pixFeat = keyOutputs[3];
759 cv::Mat key = keyOutputs[4];
760 cv::Mat shrinkage = keyOutputs[5];
761 cv::Mat selection = keyOutputs[6];
764 if (!seedMask.empty()) {
765 foreground = MakeMaskBlob(seedMask, transform);
766 }
else if (HasMemory()) {
767 memoryReadout.setInput(key,
"query_key");
768 memoryReadout.setInput(selection,
"query_selection");
769 memoryReadout.setInput(MemoryKeyBlob(),
"memory_key");
770 memoryReadout.setInput(MemoryShrinkageBlob(),
"memory_shrinkage");
771 memoryReadout.setInput(MemoryValueBlob(),
"memory_value");
772 memoryReadout.setInput(MemoryValidBlob(),
"memory_valid");
773 memoryReadout.setInput(objectMemory,
"object_memory");
774 memoryReadout.setInput(pixFeat,
"pix_feat");
775 memoryReadout.setInput(sensory,
"sensory");
776 memoryReadout.setInput(lastMask,
"last_mask");
777 std::vector<cv::Mat> readoutOutputs;
778 memoryReadout.forward(readoutOutputs, std::vector<cv::String>{
"memory_readout"});
780 decode.setInput(f8,
"f8");
781 decode.setInput(f4,
"f4");
782 decode.setInput(readoutOutputs[0],
"memory_readout");
783 decode.setInput(sensory,
"sensory");
784 std::vector<cv::Mat> decodeOutputs;
785 decode.forward(decodeOutputs, std::vector<cv::String>{
"new_sensory",
"logits",
"prob"});
786 sensory = decodeOutputs[0].clone();
787 foreground = ForegroundFromProb(decodeOutputs[2]);
793 const bool isMemoryFrame = !seedMask.empty() || frameIndex - lastMemoryFrame >= memEvery;
795 encodeValue.setInput(image,
"image");
796 encodeValue.setInput(pixFeat,
"pix_feat");
797 encodeValue.setInput(sensory,
"sensory");
798 encodeValue.setInput(foreground,
"mask");
799 std::vector<cv::Mat> valueOutputs;
800 encodeValue.forward(valueOutputs, std::vector<cv::String>{
"mask_value",
"new_sensory",
"object_memory"});
801 sensory = valueOutputs[1].clone();
802 AddObjectMemory(valueOutputs[2]);
803 AddMemory(key, shrinkage, valueOutputs[0], validMask, !seedMask.empty());
804 lastMemoryFrame = frameIndex;
807 lastMask = foreground.clone();
808 cv::Mat outputMask = BinaryMaskFromForeground(foreground, transform);
817 : processingController(&controller)
824 return LoadONNXModel(modelPath,
nullptr);
829 if (!frame || efficientSamModelPath.empty() || promptKeyframes.empty())
830 return std::shared_ptr<Frame>();
832 std::string loadError = LoadONNXModel(efficientSamModelPath, &efficientSam);
833 if (!loadError.empty())
834 return std::shared_ptr<Frame>();
835 SetProcessingDevice();
838 cv::Mat frameImage = frame->GetImageCV();
839 cv::Mat seedMask = CreateEfficientSAMSeedMask(frameImage, prompts);
840 if (seedMask.empty())
841 return std::shared_ptr<Frame>();
843 auto maskImage = std::make_shared<QImage>(
844 seedMask.cols, seedMask.rows, QImage::Format_RGBA8888_Premultiplied);
845 maskImage->fill(Qt::transparent);
846 for (
int y = 0; y < seedMask.rows; ++y) {
847 const uint8_t* src = seedMask.ptr<uint8_t>(y);
848 QRgb* dst =
reinterpret_cast<QRgb*
>(maskImage->scanLine(y));
849 for (
int x = 0; x < seedMask.cols; ++x)
850 dst[x] = src[x] ? qRgba(255, 255, 255, 255) : qRgba(0, 0, 0, 0);
853 auto result = std::make_shared<Frame>(frame->number, seedMask.cols, seedMask.rows,
"#000000");
854 result->AddImage(maskImage);
858 void CVObjectMask::SetProcessingDevice()
860 const std::string requestedDevice = processingDevice;
861 if (processingDevice ==
"CPU") {
862 efficientSam.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
863 efficientSam.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
868 if (processingDevice ==
"GPU" || processingDevice ==
"GPU_AUTO" || processingDevice ==
"GPU_CUDA") {
870 const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_CUDA);
871 if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_CUDA) != targets.end()) {
872 efficientSam.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
873 efficientSam.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
874 ZmqLogger::Instance()->
Log(
"Object Mask EfficientSAM DNN device: requested " + requestedDevice +
", selected CUDA");
877 }
catch (
const cv::Exception&) {
881 if (processingDevice ==
"GPU_OPENCL") {
883 const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_OPENCV);
884 if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_OPENCL) != targets.end()) {
885 cv::ocl::setUseOpenCL(
true);
886 efficientSam.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
887 efficientSam.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
888 ZmqLogger::Instance()->
Log(
"Object Mask EfficientSAM DNN device: requested " + requestedDevice +
", selected OpenCL");
891 }
catch (
const cv::Exception&) {
895 processingDevice =
"CPU";
896 efficientSam.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
897 efficientSam.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
898 ZmqLogger::Instance()->
Log(
"Object Mask EfficientSAM DNN device: requested " + requestedDevice +
", selected CPU");
907 processingController->
SetError(
false,
"");
909 if (efficientSamModelPath.empty()) {
910 processingController->
SetError(
true,
"Missing path to EfficientSAM ONNX model file");
914 if (protobufDataPath.empty()) {
915 processingController->
SetError(
true,
"Missing path to object mask protobuf data file");
919 if (promptKeyframes.empty()) {
920 processingController->
SetError(
true,
"Missing positive prompt point for Object Mask preprocessing");
925 std::string loadError = LoadONNXModel(efficientSamModelPath, &efficientSam);
926 if (!loadError.empty()) {
927 processingController->
SetError(
true, loadError);
931 SetProcessingDevice();
933 CutiePropagator cutie;
934 if (cutieEncodeKeyModelPath.empty() && !cutieModelDir.empty())
935 cutieEncodeKeyModelPath = cutieModelDir +
"/cutie-encode-key-640x368.onnx";
936 if (cutieEncodeValueModelPath.empty() && !cutieModelDir.empty())
937 cutieEncodeValueModelPath = cutieModelDir +
"/cutie-encode-value-640x368.onnx";
938 if (cutieMemoryReadoutModelPath.empty() && !cutieModelDir.empty())
939 cutieMemoryReadoutModelPath = cutieModelDir +
"/cutie-memory-readout-floatmask-valid-640x368-m6-topk30-opencv.onnx";
940 if (cutieDecodeModelPath.empty() && !cutieModelDir.empty())
941 cutieDecodeModelPath = cutieModelDir +
"/cutie-decode-640x368.onnx";
942 if (cutieEncodeKeyModelPath.empty() || cutieEncodeValueModelPath.empty() ||
943 cutieMemoryReadoutModelPath.empty() || cutieDecodeModelPath.empty()) {
944 processingController->
SetError(
true,
"Missing path to Cutie ONNX model files");
949 cutie.Load(cutieEncodeKeyModelPath, cutieEncodeValueModelPath, cutieMemoryReadoutModelPath, cutieDecodeModelPath);
950 const std::string cutieDevice = cutie.SetDevice(processingDevice);
951 ZmqLogger::Instance()->
Log(
"Object Mask Cutie DNN device: requested " + processingDevice +
", selected " + cutieDevice);
952 }
catch (
const cv::Exception& e) {
953 processingController->
SetError(
true, std::string(
"Failed to load Cutie ONNX models: ") + e.what());
956 }
catch (
const std::exception& e) {
957 processingController->
SetError(
true, std::string(
"Failed to load Cutie ONNX models: ") + e.what());
962 if (!process_interval || end <= 1 || end - start == 0) {
963 start =
static_cast<size_t>(video.
Start() * video.
Reader()->info.fps.ToFloat());
964 end =
static_cast<size_t>(video.
End() * video.
Reader()->info.fps.ToFloat());
970 auto promptBeforeStart = promptKeyframes.upper_bound(start);
971 if (promptBeforeStart != promptKeyframes.begin()) {
973 activePrompts = promptBeforeStart->second;
975 auto firstPromptAtOrAfterStart = promptKeyframes.lower_bound(start);
977 for (
size_t frameNumber = start; frameNumber <= end; ++frameNumber) {
981 std::shared_ptr<openshot::Frame> frame = video.
GetFrame(frameNumber);
985 auto promptIt = promptKeyframes.find(frameNumber);
986 bool isPromptKeyframe = promptIt != promptKeyframes.end();
987 if (promptIt != promptKeyframes.end()) {
988 activePrompts = promptIt->second;
991 if (firstPromptAtOrAfterStart != promptKeyframes.end() && frameNumber >= firstPromptAtOrAfterStart->first) {
992 activePrompts = firstPromptAtOrAfterStart->second;
993 isPromptKeyframe =
true;
997 emptyFrame.
frameId = frameNumber;
1003 const cv::Mat frameImage = frame->GetImageCV();
1005 if (isPromptKeyframe || !cutie.HasMemory()) {
1006 seedMask = CreateEfficientSAMSeedMask(frameImage, activePrompts);
1007 if (seedMask.empty()) {
1009 emptyFrame.
frameId = frameNumber;
1013 if (!isPromptKeyframe)
1017 cv::Mat propagatedMask;
1019 propagatedMask = cutie.Step(frameImage, seedMask);
1020 }
catch (
const cv::Exception& e) {
1021 processingController->
SetError(
true, std::string(
"Failed to propagate Object Mask with Cutie: ") + e.what());
1027 if (!seedMask.empty()) {
1028 outputMask = seedMask;
1029 }
else if (!propagatedMask.empty()) {
1030 cv::resize(propagatedMask, outputMask, frameImage.size(), 0, 0, cv::INTER_NEAREST);
1032 masksData[frameNumber] = FrameDataFromMask(outputMask, frameNumber, 1.0f);
1034 const size_t range = std::max<size_t>(1, end - start);
1035 processingController->
SetProgress(uint(100 * (frameNumber - start) / range));
1039 cv::Mat CVObjectMask::CreateEfficientSAMSeedMask(
const cv::Mat& frame,
const CVObjectMaskPromptSet& prompts)
1041 EfficientSamPreprocessResult prep = MakeEfficientSamBlob(frame, modelSize);
1044 std::vector<cv::Point> backgroundPoints;
1045 std::vector<cv::Rect> backgroundRects;
1046 cv::Mat pointCoords = MakeEfficientSamPromptBlob(promptSet, prep, promptSlots, backgroundPoints, backgroundRects);
1047 cv::Mat pointLabels = MakeEfficientSamLabelBlob(promptSet, promptSlots);
1049 efficientSam.setInput(prep.blob,
"batched_images");
1050 efficientSam.setInput(pointCoords,
"batched_point_coords");
1051 efficientSam.setInput(pointLabels,
"batched_point_labels");
1053 std::vector<cv::Mat> outputs;
1054 efficientSam.forward(outputs, std::vector<cv::String>{
"output_masks",
"iou_predictions"});
1055 if (outputs.size() != 2)
1058 cv::Mat modelMask = SelectEfficientSamMask(outputs[0], outputs[1], backgroundPoints, backgroundRects, maskThreshold);
1059 if (modelMask.empty())
1061 return EfficientSamMaskToFrameMask(modelMask, frame.size(), maskThreshold);
1065 return runPromptSet(prompts);
1067 cv::Mat combinedMask(frame.rows, frame.cols, CV_8U, cv::Scalar(0));
1068 bool hasMask =
false;
1074 cv::Mat rectMask = runPromptSet(rectPrompt);
1075 if (rectMask.empty())
1077 cv::bitwise_or(combinedMask, rectMask, combinedMask);
1086 cv::Mat pointMask = runPromptSet(pointPrompt);
1087 if (!pointMask.empty()) {
1088 cv::bitwise_or(combinedMask, pointMask, combinedMask);
1093 return hasMask ? combinedMask : cv::Mat();
1098 if (protobufDataPath.empty()) {
1099 std::cerr <<
"Missing path to object mask protobuf data file." << std::endl;
1105 pb_objdetect::ObjDetect objMessage;
1106 objMessage.add_classnames()->assign(
"object mask");
1109 AddFrameDataToProto(objMessage.add_frame(), frameData.second);
1111 *objMessage.mutable_last_updated() = TimeUtil::SecondsToTimestamp(time(NULL));
1113 std::fstream output(protobufDataPath, std::ios::out | std::ios::trunc | std::ios::binary);
1114 if (!objMessage.SerializeToOstream(&output)) {
1115 std::cerr <<
"Failed to write object mask protobuf message." << std::endl;
// Copies one per-frame object-mask record into a protobuf frame message.
//
// pbFrameData: destination pb_objdetect::Frame; the statements below
//              dereference it directly.
// frameData:   source record; its frameId, box, score, objectId, width,
//              height and rle members are read.
1122 void CVObjectMask::AddFrameDataToProto(pb_objdetect::Frame* pbFrameData,
const CVObjectMaskFrameData& frameData)
// Tag the protobuf frame with the id of the source frame.
1124 pbFrameData->set_id(frameData.
frameId);
// Append a new bounding-box entry to the frame and fill it from frameData.box.
// NOTE(review): box values are presumably normalized to the frame size
// (see NormalizedBoundingBox) - confirm against the code that fills frameData.
1128 pb_objdetect::Frame_Box* box = pbFrameData->add_bounding_box();
1129 box->set_x(frameData.
box.x);
1130 box->set_y(frameData.
box.y);
1131 box->set_w(frameData.
box.width);
1132 box->set_h(frameData.
box.height);
// The class id is hard-coded to 0.
// NOTE(review): presumably the index of the single "object mask" class name
// that the writer adds to the ObjDetect message - confirm.
1133 box->set_classid(0);
// The record's score is stored in the box's confidence field.
1134 box->set_confidence(frameData.
score);
1135 box->set_objectid(frameData.
objectId);
// Attach the mask sub-message to this box; mutable_mask() returns the
// box-owned message that the following setters fill in.
1137 pb_objdetect::Frame_Box_Mask* mask = box->mutable_mask();
// NOTE(review): width/height are presumably the pixel dimensions of the mask
// that the RLE data decodes to - confirm.
1138 mask->set_width(frameData.
width);
1139 mask->set_height(frameData.
height);
// Append every element of frameData.rle, in order, to the mask's repeated
// rle field.
// NOTE(review): the counts are presumably produced by EncodeBinaryMaskRLE -
// verify against the producer of frameData.
1140 for (uint32_t count : frameData.
rle)
1141 mask->add_rle(count);
1148 }
catch (
const std::exception&) {
1149 std::cout <<
"JSON is invalid (missing keys or invalid data types)" << std::endl;
1155 if (!root[
"protobuf_data_path"].isNull())
1156 protobufDataPath = root[
"protobuf_data_path"].asString();
1157 if (!root[
"efficient_sam_model"].isNull())
1158 efficientSamModelPath = root[
"efficient_sam_model"].asString();
1159 if (!root[
"efficient_sam_model_path"].isNull())
1160 efficientSamModelPath = root[
"efficient_sam_model_path"].asString();
1161 if (!root[
"sam_model"].isNull())
1162 efficientSamModelPath = root[
"sam_model"].asString();
1163 if (!root[
"sam_model_path"].isNull())
1164 efficientSamModelPath = root[
"sam_model_path"].asString();
1165 if (!root[
"encoder_model"].isNull())
1166 efficientSamModelPath = root[
"encoder_model"].asString();
1167 if (!root[
"encoder_model_path"].isNull())
1168 efficientSamModelPath = root[
"encoder_model_path"].asString();
1169 if (!root[
"cutie_model_dir"].isNull())
1170 cutieModelDir = root[
"cutie_model_dir"].asString();
1171 if (!root[
"cutie_encode_key_model"].isNull())
1172 cutieEncodeKeyModelPath = root[
"cutie_encode_key_model"].asString();
1173 if (!root[
"cutie_encode_key_model_path"].isNull())
1174 cutieEncodeKeyModelPath = root[
"cutie_encode_key_model_path"].asString();
1175 if (!root[
"cutie_encode_value_model"].isNull())
1176 cutieEncodeValueModelPath = root[
"cutie_encode_value_model"].asString();
1177 if (!root[
"cutie_encode_value_model_path"].isNull())
1178 cutieEncodeValueModelPath = root[
"cutie_encode_value_model_path"].asString();
1179 if (!root[
"cutie_memory_readout_model"].isNull())
1180 cutieMemoryReadoutModelPath = root[
"cutie_memory_readout_model"].asString();
1181 if (!root[
"cutie_memory_readout_model_path"].isNull())
1182 cutieMemoryReadoutModelPath = root[
"cutie_memory_readout_model_path"].asString();
1183 if (!root[
"cutie_decode_model"].isNull())
1184 cutieDecodeModelPath = root[
"cutie_decode_model"].asString();
1185 if (!root[
"cutie_decode_model_path"].isNull())
1186 cutieDecodeModelPath = root[
"cutie_decode_model_path"].asString();
1187 if (!root[
"processing-device"].isNull())
1188 processingDevice = root[
"processing-device"].asString();
1189 if (!root[
"processing_device"].isNull())
1190 processingDevice = root[
"processing_device"].asString();
1191 if (!root[
"prompt_slots"].isNull())
1192 promptSlots = std::max(1, std::min(6, root[
"prompt_slots"].asInt()));
1193 if (!root[
"mask_threshold"].isNull())
1194 maskThreshold = root[
"mask_threshold"].asFloat();
1195 if (!root[
"model_size"].isNull())
1196 modelSize = root[
"model_size"].asInt();
1197 promptKeyframes.clear();
1198 if (!root[
"object_mask_selection"].isNull()) {
1199 const Json::Value& selection = root[
"object_mask_selection"];
1200 const Json::Value& frames = selection[
"frames"];
1201 if (frames.isObject()) {
1202 for (
const auto& frameName : frames.getMemberNames()) {
1203 const size_t frameNumber = JsonFrameNumber(frameName);
1204 if (frameNumber == 0)
1208 promptKeyframes[frameNumber] = prompts;
1214 if (!root[
"positive_points"].isNull())
1215 AppendJsonPoints(root[
"positive_points"], legacyPrompts.
positivePoints);
1216 if (!root[
"negative_points"].isNull())
1217 AppendJsonPoints(root[
"negative_points"], legacyPrompts.
negativePoints);
1219 if (!root[
"positive_x"].isNull() && !root[
"positive_y"].isNull()) {
1220 cv::Point2f point(root[
"positive_x"].asFloat(), root[
"positive_y"].asFloat());
1224 if (!root[
"negative_x"].isNull() && !root[
"negative_y"].isNull()) {
1225 cv::Point2f point(root[
"negative_x"].asFloat(), root[
"negative_y"].asFloat());
1229 if (!root[
"rect_x1"].isNull() && !root[
"rect_y1"].isNull() &&
1230 !root[
"rect_x2"].isNull() && !root[
"rect_y2"].isNull()) {
1232 rect[
"x1"] = root[
"rect_x1"];
1233 rect[
"y1"] = root[
"rect_y1"];
1234 rect[
"x2"] = root[
"rect_x2"];
1235 rect[
"y2"] = root[
"rect_y2"];
1236 cv::Rect_<float> parsed;
1237 if (RectFromJson(rect, parsed))
1241 promptKeyframes[1] = legacyPrompts;