#!/usr/bin/env python3
""" S3FD Face detection plugin
https://arxiv.org/abs/1708.05237

Adapted from S3FD Port in FAN:
https://github.com/1adrianb/face-alignment
"""

import os
from scipy.special import logsumexp

import numpy as np

from lib.multithreading import MultiThread
from ._base import Detector, dlib, logger


class Detect(Detector):
    """ S3FD detector for face recognition """
    def __init__(self, **kwargs):
        """ Record the plugin name, detection image size and per-thread VRAM requirement.

        The network itself is not built here; self.model stays None until initialize()
        creates the S3fd instance.
        """
        super().__init__(**kwargs)
        self.model = None  # replaced by an S3fd instance in initialize()
        self.name = "s3fd"
        self.vram = 4096  # MB required per detection thread
        self.target = (640, 640)  # Uses approx 4 GB of VRAM

    def set_model_path(self):
        """ Locate the s3fd model file inside the plugin cache folder.

        Nothing is loaded here: the method only builds the path and verifies that a
        regular file exists at that location.

        Returns:
            str: full path to "s3fd.pb" inside self.cachepath
        Raises:
            FileNotFoundError: no model file exists at the expected location. This is a
                subclass of Exception, so handlers written for the previous generic
                Exception continue to catch it.
        """
        model_path = os.path.join(self.cachepath, "s3fd.pb")
        # isfile (rather than exists) so a stray directory with the model's name is rejected
        # here instead of failing later when the graph is read
        if not os.path.isfile(model_path):
            raise FileNotFoundError("Error: Unable to find {}, reinstall "
                                    "the lib!".format(model_path))
        logger.debug("Loading model: '%s'", model_path)
        return model_path

    def initialize(self, *args, **kwargs):
        """ Create the s3fd detector and work out how many detection threads to run.

        Builds the S3fd network with a per-thread VRAM fraction, derives self.batch_size
        (the number of worker threads) from the memory available, then sets self.init to
        signal that the plugin is ready.

        Raises:
            ValueError: the available memory is less than self.vram, so not even a single
                detection thread can be run
        """
        super().initialize(*args, **kwargs)
        logger.info("Initializing S3FD Detector...")
        card_id, vram_free, vram_total = self.get_vram_free()
        if vram_free <= self.vram:
            # Not more than one thread's worth of memory free: do not cap Tensorflow
            tf_ratio = 1.0
        else:
            tf_ratio = self.vram / vram_total
        # tf_ratio is a 0-1 fraction; scale it so the figure agrees with the "%" in the message
        logger.verbose("Reserving %s%% of total VRAM per s3fd thread",
                       round(tf_ratio * 100, 2))

        confidence = self.config["confidence"] / 100
        self.model = S3fd(self.model_path, self.target, tf_ratio, card_id, confidence)

        if not self.model.is_gpu:
            # No GPU device in the session: assume a fixed 2048MB budget
            alloc = 2048
            logger.warning("Using CPU")
        else:
            logger.debug("Using GPU")
            alloc = vram_free
        logger.debug("Allocated for Tensorflow: %sMB", alloc)

        # One worker thread per self.vram MB of allocated memory
        self.batch_size = int(alloc / self.vram)

        if self.batch_size < 1:
            raise ValueError("Insufficient VRAM available to continue "
                             "({}MB)".format(int(alloc)))

        logger.verbose("Processing in %s threads", self.batch_size)

        self.init.set()
        logger.info("Initialized S3FD Detector.")

    def detect_faces(self, *args, **kwargs):
        """ Run detect_thread in self.batch_size worker threads and wait for them to finish.

        Once every worker has returned, one item is taken from the "in" queue (treated as
        the sentinel) and forwarded unchanged to the "out" queue.
        """
        super().detect_faces(*args, **kwargs)
        pool = MultiThread(target=self.detect_thread, thread_count=self.batch_size)
        pool.start()
        pool.join()
        # Pass the remaining queue item downstream so the next stage sees it as well
        self.queues["out"].put(self.queues["in"].get())
        logger.debug("Detecting Faces complete")

    def detect_thread(self):
        """ Worker loop: pull items until "EOF" is received and detect faces in each one.

        Every rotation angle in self.rotation is tried in order; the first angle that yields
        any detection is used. The resulting rectangles are stored under
        item["detected_faces"] and the item is passed to self.finalize().
        """
        logger.debug("Launching Detect")
        # iter() with a sentinel keeps calling get_item() until it returns "EOF"
        for item in iter(self.get_item, "EOF"):
            logger.trace("Detecting faces: '%s'", item["filename"])
            detect_image, scale = self.compile_detection_image(item["image"], True, False, False)
            for angle in self.rotation:
                current_image, rotmat = self.rotate_image(detect_image, angle)
                faces = self.model.detect_face(current_image)
                if not faces.any():
                    # Nothing found at this angle, try the next one
                    continue
                if angle != 0:
                    logger.verbose("found face(s) by rotating image %s degrees", angle)
                break

            item["detected_faces"] = self.process_output(faces, rotmat, scale)
            self.finalize(item)

        logger.debug("Thread Completed Detect")

    def process_output(self, faces, rotation_matrix, scale):
        """ Convert raw detections into dlib rectangles in the coordinates of the source image.

        Args:
            faces: iterable of detections whose first four values are the box corners
            rotation_matrix: when this is a numpy array each rectangle is passed through
                self.rotate_rect with it; any other value leaves the rectangles as they are
            scale: factor the detection image was scaled by; coordinates are divided by it
        Returns:
            list of dlib.rectangle
        """
        logger.trace("Processing Output: (faces: %s, rotation_matrix: %s)", faces, rotation_matrix)
        rects = [dlib.rectangle(  # pylint: disable=c-extension-no-member
            *(int(coord) for coord in face[:4]))
                 for face in faces]
        if isinstance(rotation_matrix, np.ndarray):
            rects = [self.rotate_rect(rect, rotation_matrix) for rect in rects]

        detected = list()
        for rect in rects:
            # Undo the resize that was applied before detection
            corners = (rect.left(), rect.top(), rect.right(), rect.bottom())
            detected.append(dlib.rectangle(  # pylint: disable=c-extension-no-member
                *(int(corner / scale) for corner in corners)))
        logger.trace("Processed Output: %s", detected)
        return detected


class S3fd():
    """ Tensorflow Network """
    def __init__(self, model_path, target_size, vram_ratio, card_id, confidence):
        """ Load the frozen graph, look up its input/output tensors and open a session.

        Args:
            model_path: path of the frozen graph file to load
            target_size: two-item sequence giving the size of the array fed through the
                network once when the session is created
            vram_ratio: value for per_process_gpu_memory_fraction (1.0 leaves it unset)
            card_id: value for visible_device_list (-1 leaves it unset)
            confidence: minimum score a detection needs to be returned by detect_face()
        """
        logger.debug("Initializing: %s: (model_path: '%s', target_size: %s, vram_ratio: %s, "
                     "card_id: %s)",
                     type(self).__name__, model_path, target_size, vram_ratio, card_id)
        # Must import tensorflow inside the spawned process for Windows machines
        import tensorflow as tf
        self.tf = tf  # pylint: disable=invalid-name
        self.model_path = model_path
        self.confidence = confidence
        self.is_gpu = False  # set_session() overwrites this once the devices are known
        # The steps below depend on each other: graph -> tensors -> session
        self.graph = self.load_graph()
        self.input = self.graph.get_tensor_by_name("s3fd/input_1:0")
        self.output = self.get_outputs()
        self.session = self.set_session(target_size, vram_ratio, card_id)
        logger.debug("Initialized: %s", type(self).__name__)

    def load_graph(self):
        """ Read the serialized GraphDef at self.model_path and import it into a new Graph.

        All imported nodes are placed under the "s3fd" name scope.

        Returns:
            the newly created Graph holding the imported model
        """
        # pylint: disable=not-context-manager
        logger.verbose("Initializing S3FD Network model...")
        graph_def = self.tf.GraphDef()
        with self.tf.gfile.GFile(self.model_path, "rb") as model_file:
            graph_def.ParseFromString(model_file.read())

        graph = self.tf.Graph()
        with graph.as_default():
            self.tf.import_graph_def(graph_def, name="s3fd")
        return graph

    def get_outputs(self):
        """ Look up the twelve output tensors of the imported graph.

        Returns:
            list of tensors: "concat_31" followed by "transpose_72" to "transpose_102"
            (every third index), each resolved under the "s3fd" name scope
        """
        # transpose_72, transpose_75, ... transpose_102
        tensor_names = ["concat_31"]
        tensor_names.extend("transpose_{}".format(index) for index in range(72, 103, 3))
        logger.debug("tensor_names: %s", tensor_names)

        tensors = list()
        for t_name in tensor_names:
            tensors.append(self.graph.get_tensor_by_name("s3fd/{}:0".format(t_name)))
        logger.debug("tensors: %s", tensors)
        return tensors

    def set_session(self, target_size, vram_ratio, card_id):
        """ Create the TF Session for self.graph and run the network once on zeros.

        Also sets self.is_gpu according to whether any device listed by the session has
        "gpu" in its description.

        Args:
            target_size: the first two items give the last two dimensions of the zero array
            vram_ratio: per-process GPU memory fraction; only applied when it is not 1.0
            card_id: GPU to make visible; only applied when it is not -1
        Returns:
            the created Session
        """
        # pylint: disable=not-context-manager, no-member
        config = self.tf.ConfigProto()
        gpu_options = config.gpu_options
        if card_id != -1:
            gpu_options.visible_device_list = str(card_id)
        if vram_ratio != 1.0:
            gpu_options.per_process_gpu_memory_fraction = vram_ratio

        dim_0, dim_1 = target_size[0], target_size[1]
        zeros_input = np.zeros((1, 3, dim_0, dim_1))
        with self.graph.as_default():
            session = self.tf.Session(config=config)
            devices = [str(device).lower() for device in session.list_devices()]
            self.is_gpu = any("gpu" in device for device in devices)
            # Push one all-zero input through the network before real images arrive
            session.run(self.output, feed_dict={self.input: zeros_input})
        return session

    def detect_face(self, feed_item):
        """ Run the network on one image and return the surviving detections.

        The image has the per-channel constants (104, 117, 123) subtracted, is moved to
        channels-first layout with a leading batch axis and cast to float32. The network
        outputs are decoded by post_process(), thinned with nms() at an overlap threshold of
        0.3 and finally filtered on self.confidence.

        Args:
            feed_item: image array with the channel axis last (3 channels)
        Returns:
            numpy array with one [x_1, y_1, x_2, y_2, score] row per detection; an empty
            array when nothing reaches the confidence threshold
        """
        channel_means = np.array([104.0, 117.0, 123.0])
        batch = np.expand_dims((feed_item - channel_means).transpose(2, 0, 1), 0)
        batch = batch.astype('float32')

        raw_outputs = self.session.run(self.output, feed_dict={self.input: batch})
        candidates = self.post_process(raw_outputs)
        survivors = candidates[self.nms(candidates, 0.3), :]

        confident = list()
        for row in survivors:
            if row[-1] >= self.confidence:
                confident.append(row)
        return np.array(confident)

    def post_process(self, bboxlist):
        """ Turn the raw network outputs into candidate boxes.

        Args:
            bboxlist: sequence of arrays laid out as (classification, regression) pairs, one
                pair per pyramid level. Classification arrays are indexed
                [batch, class, row, col] and regression arrays [batch, 4, row, col]. The
                sequence is only read; it is never modified.
        Returns:
            numpy array of shape (N, 5) holding [x_1, y_1, x_2, y_2, score] rows, ordered by
            pyramid level and then row-major grid position. When no position scores above
            0.05 a single all-zero row, shape (1, 5), is returned.
        """
        variances = [0.1, 0.2]
        detections = list()
        for level in range(len(bboxlist) // 2):
            # Softmax into a local array so the caller's list keeps the raw outputs
            ocls = self.softmax(bboxlist[level * 2], axis=1)
            oreg = bboxlist[level * 2 + 1]
            stride = 2 ** (level + 2)    # 4,8,16,32,64,128

            # Grid positions whose class 1 probability passes the coarse 0.05 pre-filter
            _, hindex, windex = np.where(ocls[:, 1, :, :] > 0.05)
            if hindex.size == 0:
                continue

            # Values are always read from the first batch entry
            scores = ocls[0, 1, hindex, windex]
            loc = oreg[0][:, hindex, windex].T  # (N, 4) regression offsets

            # Priors in centre-offset form: centred on the grid cell, 4 strides wide and high
            size = np.full(hindex.shape, stride * 4.0)
            priors = np.stack((stride / 2 + windex * stride,
                               stride / 2 + hindex * stride,
                               size,
                               size), axis=1)
            boxes = self.decode(loc, priors, variances)
            detections.append(np.column_stack((boxes, scores)))

        if not detections:
            return np.zeros((1, 5))
        return np.concatenate(detections)

    @staticmethod
    def softmax(inp, axis):
        """Compute softmax values for each sets of scores in x."""
        return np.exp(inp - logsumexp(inp, axis=axis, keepdims=True))

    @staticmethod
    def decode(loc, priors, variances):
        """Decode locations from predictions using priors to undo
        the encoding we did for offset regression at train time.
        Args:
            loc (tensor): location predictions for loc layers,
                Shape: [num_priors,4]
            priors (tensor): Prior boxes in center-offset form.
                Shape: [num_priors,4].
            variances: (list[float]) Variances of priorboxes
        Return:
            decoded bounding box predictions
        """
        boxes = np.concatenate((priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
                                priors[:, 2:] * np.exp(loc[:, 2:] * variances[1])),
                               1)
        boxes[:, :2] -= boxes[:, 2:] / 2
        boxes[:, 2:] += boxes[:, :2]
        return boxes

    @staticmethod
    def nms(dets, thresh):
        """ Perform Non-Maximum Suppression """
        keep = list()
        if len(dets) == 0:
            return keep

        x_1, y_1, x_2, y_2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
        areas = (x_2 - x_1 + 1) * (y_2 - y_1 + 1)
        order = scores.argsort()[::-1]

        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(i)
            xx_1, yy_1 = np.maximum(x_1[i], x_1[order[1:]]), np.maximum(y_1[i], y_1[order[1:]])
            xx_2, yy_2 = np.minimum(x_2[i], x_2[order[1:]]), np.minimum(y_2[i], y_2[order[1:]])

            width, height = np.maximum(0.0, xx_2 - xx_1 + 1), np.maximum(0.0, yy_2 - yy_1 + 1)
            ovr = width * height / (areas[i] + areas[order[1:]] - width * height)

            inds = np.where(ovr <= thresh)[0]
            order = order[inds + 1]

        return keep