/*
 * Copyright 2008 Sony Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the names of the copyright holders nor the names of their
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "e_cell.h"
#include "e_cell_err.h"

#include "libspe2_runtime.h"

#include <stdlib.h>
#include <string.h>
#include <unistd.h>

//#define ENABLE_PPE_TO_PPE 1 /* this option increases CPU consumption */

#ifdef __PPU__
#include <ppu_intrinsics.h>
#else /* !__PPU__ */
/*
 * Fallback used when <ppu_intrinsics.h> is not available: loads the 32-bit
 * word at p with the PowerPC lwarx (load word and reserve) instruction and
 * returns it.
 *
 * The "memory" clobber keeps the compiler from caching memory contents in
 * registers across the instruction or moving other accesses around it.
 * __asm__ is spelled out so the helper also builds in strict ISO C modes,
 * where plain `asm` is not a keyword.
 */
static inline uint32_t __lwarx(const void *p)
	{
	uint32_t r;
	__asm__ volatile("lwarx %[r],0,%[p]" : [r]"=r"(r) : [p]"b"(p) : "memory");
	return r;
	}

/*
 * Fallback used when <ppu_intrinsics.h> is not available: attempts to store
 * v to the 32-bit word at p with the PowerPC stwcx. (store word conditional)
 * instruction.
 *
 * Returns non-zero (the CR0[EQ] bit, 0x20000000, copied out with mfcr) when
 * the store was performed and 0 when it was not.
 *
 * stwcx. rewrites condition register field 0 and may write memory, so both
 * are declared as clobbers; without them the compiler is free to keep a
 * live value in CR0 or a stale copy of *p across the instruction.
 */
static inline uint32_t __stwcx(void *p, uint32_t v)
	{
	uint32_t r;
	__asm__ volatile("stwcx. %[v],0,%[p]\n"
		"mfcr %[r]"
		: [r]"=r"(r)
		: [p]"b"(p), [v]"r"(v)
		: "cc", "memory");
	return r & 0x20000000;
	}

/*
 * Fallback used when <ppu_intrinsics.h> is not available: issues the PowerPC
 * lwsync barrier instruction.
 *
 * The "memory" clobber makes this a compiler barrier as well; without it the
 * compiler may move ordinary loads and stores across the instruction, which
 * defeats the purpose of the hardware barrier.
 */
static inline void __lwsync(void)
	{
	__asm__ volatile("lwsync" : : : "memory");
	}
#endif /* __PPU__ */


/* Element storage of queue q as a pointer; q->buffer holds it as an integer. */
#define GET_QUEUE_BUFFER(q) ((void *)(uintptr_t)((q)->buffer))
/*
 * Address of slot n, i.e. n * aligned_elm_size bytes into the storage.
 * The offset is applied to a char pointer because arithmetic on void * is a
 * GNU extension, and it is computed as size_t so it is not limited to the
 * width of the operands.
 */
#define GET_QUEUE_ELM(q, n) \
	((void *)((char *)GET_QUEUE_BUFFER(q) + (size_t)(q)->aligned_elm_size * (n)))

/* Records pointer b as the element storage of queue q. */
#define SET_QUEUE_BUFFER(q, b) ((q)->buffer = (uintptr_t)(b))

/* Number of retry-loop iterations after which sync_relax() is called. */
#define SPIN_COUNT_MAX (1 << 5)


/*
 * Allocates a queue of `depth` slots, each able to hold `elm_size` bytes.
 *
 * The control block is allocated with SHARED_ATOMIC_DATA_ALIGN alignment and
 * zero-filled; the element storage is allocated separately with
 * SHARED_DATA_ALIGN alignment, each slot being elm_size rounded up to that
 * alignment.
 *
 * task and from_spe_to_ppe are accepted for interface compatibility but are
 * not used by this implementation.
 *
 * Returns the address of the control block as a 64-bit value, or 0 after
 * reporting an error through CELLerr().  A negative elm_size, a depth that
 * is not positive, or a storage size that does not fit in size_t is refused
 * before the storage is requested; these cases are reported with
 * CELL_R_MEMORY_ALLOCATION_FAILED, the only reason code this function uses.
 */
uint64_t cell_queue_create(spe_task_t *task, int elm_size, int depth, int from_spe_to_ppe)
	{
	cell_queue_t *q;
	void *ptr;
	size_t slot_size;
	size_t buffer_size;

	(void)task;
	(void)from_spe_to_ppe;

	if (elm_size < 0 || depth <= 0)
		{
		CELLerr(CELL_F_CELL_QUEUE_CREATE, CELL_R_MEMORY_ALLOCATION_FAILED);
		return 0;
		}

	q = aligned_malloc(SHARED_ATOMIC_DATA_ALIGN, sizeof(*q));
	if (!q)
		{
		CELLerr(CELL_F_CELL_QUEUE_CREATE, CELL_R_MEMORY_ALLOCATION_FAILED);
		return 0;
		}

	memset(q, 0, sizeof(*q));

	q->elm_size = elm_size;
	q->aligned_elm_size = ALIGN_CEIL(elm_size, SHARED_DATA_ALIGN);
	q->depth = depth;

	/* Do the multiplication in size_t and refuse sizes that would wrap. */
	slot_size = (size_t)q->aligned_elm_size;
	if (slot_size > (size_t)-1 / (size_t)depth)
		{
		aligned_free(q);
		CELLerr(CELL_F_CELL_QUEUE_CREATE, CELL_R_MEMORY_ALLOCATION_FAILED);
		return 0;
		}
	buffer_size = slot_size * (size_t)depth;

	ptr = aligned_malloc(SHARED_DATA_ALIGN, buffer_size);
	if (!ptr)
		{
		aligned_free(q);
		CELLerr(CELL_F_CELL_QUEUE_CREATE, CELL_R_MEMORY_ALLOCATION_FAILED);
		return 0;
		}
	SET_QUEUE_BUFFER(q, ptr);

	__lwsync(); /* sync system memory */

	return (uintptr_t)q;
	}

/*
 * Releases a queue created by cell_queue_create().
 *
 * The element storage is wiped with OPENSSL_cleanse() before it is freed.
 * A q_ea of 0 -- the value cell_queue_create() returns on failure -- is
 * accepted and ignored, so callers can destroy unconditionally on their
 * cleanup paths.
 */
void cell_queue_destroy(uint64_t q_ea)
	{
	cell_queue_t *q = (cell_queue_t *)(uintptr_t)q_ea;
	void *buffer;

	if (!q)
		{
		return;
		}

	buffer = GET_QUEUE_BUFFER(q);
	if (buffer)
		{
		OPENSSL_cleanse(buffer, (size_t)q->aligned_elm_size * (size_t)q->depth);
		aligned_free(buffer);
		}
	aligned_free(q);
	}

/*
 * Claims the write side of the queue and hands back the slot to fill.
 *
 * Retries a lwarx/stwcx update of q->status until the write status has been
 * marked locked.  While the write status is already locked, or the write
 * pointer is not below read pointer + depth, sync_wait() is called and the
 * attempt is repeated.  Every SPIN_COUNT_MAX consecutive attempts
 * sync_relax() is called.
 *
 * On return *ptr is the address of the claimed slot.  Always returns 1.
 */
static int cell_queue_push_begin(cell_queue_t *q, void **ptr)
	{
	uint32_t word;
	uint32_t rd, wr;
	uint32_t slot;
	int spins = 0;
	int can_claim;

	for (;;)
		{
		if (++spins >= SPIN_COUNT_MAX)
			{
			spins = 0;
			sync_relax();
			}

		word = __lwarx(&q->status);
		rd = CELL_QUEUE_UNPACK_R_STATUS(word);
		wr = CELL_QUEUE_UNPACK_W_STATUS(word);

		/* claimable: no other writer holds the lock and a slot is free */
		can_claim = !CELL_QUEUE_IS_LOCKED(wr) &&
			CELL_QUEUE_POINTER(wr) < CELL_QUEUE_POINTER(rd) + q->depth;
		if (!can_claim)
			{
			sync_wait(q, word);
			spins = 0;
			continue;
			}

		/* publish the locked write status; retry if the store did not take */
		wr = CELL_QUEUE_SET_LOCKED(wr);
		word = CELL_QUEUE_PACK_STATUS(rd, wr);
		if (__stwcx(&q->status, word))
			{
			break;
			}
		}

	/* reduce the write pointer to a slot index (pointer mod q->depth) */
	slot = CELL_QUEUE_POINTER(wr);
	if (slot >= q->depth)
		{
		slot -= q->depth;
		}
	ASSERT(slot < q->depth);

	*ptr = GET_QUEUE_ELM(q, slot);

	return 1;
	}

/*
 * Non-waiting variant of cell_queue_push_begin().
 *
 * Returns 0 straight away when the write status is locked or the write
 * pointer is not below read pointer + depth.  Otherwise marks the write
 * status locked with a lwarx/stwcx update (retried until the conditional
 * store succeeds), stores the address of the claimed slot in *ptr and
 * returns 1.  Every SPIN_COUNT_MAX consecutive attempts sync_relax() is
 * called.
 */
static int cell_queue_try_push_begin(cell_queue_t *q, void **ptr)
	{
	uint32_t word;
	uint32_t rd, wr;
	uint32_t slot;
	int spins = 0;
	int can_claim;

	for (;;)
		{
		if (++spins >= SPIN_COUNT_MAX)
			{
			spins = 0;
			sync_relax();
			}

		word = __lwarx(&q->status);
		rd = CELL_QUEUE_UNPACK_R_STATUS(word);
		wr = CELL_QUEUE_UNPACK_W_STATUS(word);

		/* claimable: no other writer holds the lock and a slot is free */
		can_claim = !CELL_QUEUE_IS_LOCKED(wr) &&
			CELL_QUEUE_POINTER(wr) < CELL_QUEUE_POINTER(rd) + q->depth;
		if (!can_claim)
			{
			/* no room to push more elements */
			return 0;
			}

		/* publish the locked write status; retry if the store did not take */
		wr = CELL_QUEUE_SET_LOCKED(wr);
		word = CELL_QUEUE_PACK_STATUS(rd, wr);
		if (__stwcx(&q->status, word))
			{
			break;
			}
		}

	/* reduce the write pointer to a slot index (pointer mod q->depth) */
	slot = CELL_QUEUE_POINTER(wr);
	if (slot >= q->depth)
		{
		slot -= q->depth;
		}
	ASSERT(slot < q->depth);

	*ptr = GET_QUEUE_ELM(q, slot);

	return 1;
	}

/*
 * Completes a push started by one of the push_begin functions.
 *
 * Issues __lwsync() first, then updates q->status with a lwarx/stwcx loop:
 * the write status becomes its pointer value plus one, which also drops the
 * write lock, and when both pointers have reached q->depth both are reduced
 * by q->depth.  Every SPIN_COUNT_MAX consecutive attempts sync_relax() is
 * called.
 *
 * When ENABLE_PPE_TO_PPE is defined, sync_notify() is called if the read and
 * write pointers were equal before the update.  Always returns 1.
 */
static int cell_queue_push_end(cell_queue_t *q)
	{
	uint32_t word;
	uint32_t rd, wr;
	int spins = 0;
	int notify;

	__lwsync(); /* sync system memory */

	for (;;)
		{
		if (++spins >= SPIN_COUNT_MAX)
			{
			spins = 0;
			sync_relax();
			}

		word = __lwarx(&q->status);
		rd = CELL_QUEUE_UNPACK_R_STATUS(word);
		wr = CELL_QUEUE_UNPACK_W_STATUS(word);

		/* pointers equal before the update: remember to notify */
		notify = (CELL_QUEUE_POINTER(wr) == CELL_QUEUE_POINTER(rd));

		/* unlock write status and increment write pointer */
		wr = CELL_QUEUE_POINTER(wr) + 1;

		/* pull both pointers back once both have passed q->depth */
		if (CELL_QUEUE_POINTER(wr) >= q->depth &&
			CELL_QUEUE_POINTER(rd) >= q->depth)
			{
			wr -= q->depth;
			rd -= q->depth;
			}

		word = CELL_QUEUE_PACK_STATUS(rd, wr);
		if (__stwcx(&q->status, word))
			{
			break;
			}
		}

#ifdef ENABLE_PPE_TO_PPE
	if (notify)
		{
		sync_notify(q);
		}
#endif /* ENABLE_PPE_TO_PPE */

	return 1;
	}

/*
 * Appends one element to the queue identified by q_ea.
 *
 * Claims a slot with cell_queue_push_begin(), copies q->elm_size bytes from
 * buffer into it and finishes with cell_queue_push_end().  Returns the
 * result of cell_queue_push_end(), or 0 if the slot could not be claimed.
 */
int cell_queue_push(uint64_t q_ea, const void *buffer)
	{
	cell_queue_t *queue = (cell_queue_t *)(uintptr_t)q_ea;
	void *slot;

	if (cell_queue_push_begin(queue, &slot))
		{
		/* copy data */
		memcpy(slot, buffer, queue->elm_size);
		return cell_queue_push_end(queue);
		}

	return 0;
	}

int cell_queue_try_push(uint64_t q_ea, const void *buffer)
	{
	cell_queue_t *q = (cell_queue_t *)(uintptr_t)q_ea;
	void *elm;

	if (!cell_queue_try_push_begin(q, &elm))
		{
		return 0;
		}

	/* copy data */
	memcpy(elm, buffer, q->elm_size);

	return cell_queue_push_end(q);
	}

/*
 * Claims the read side of the queue and hands back the element to read.
 *
 * Retries a lwarx/stwcx update of q->status until the read status has been
 * marked locked.  While the read status is already locked, or the read
 * pointer is not below the write pointer, sync_wait() is called and the
 * attempt is repeated.  Every SPIN_COUNT_MAX consecutive attempts
 * sync_relax() is called.
 *
 * On return *elm is the address of the element at the read pointer.
 * Always returns 1.
 */
static int cell_queue_pop_begin(cell_queue_t *q, const void **elm)
	{
	uint32_t status;
	uint32_t r_status, w_status;
	int spin_count = 0;
	int cont = 1;

	do
		{
		spin_count++;
		if (spin_count >= SPIN_COUNT_MAX)
			{
			spin_count = 0;
			sync_relax();
			}

		status = __lwarx(&q->status);
		r_status = CELL_QUEUE_UNPACK_R_STATUS(status);
		w_status = CELL_QUEUE_UNPACK_W_STATUS(status);

		if (!CELL_QUEUE_IS_LOCKED(r_status) /* no other thread reading from this queue */ &&
			CELL_QUEUE_POINTER(r_status) < CELL_QUEUE_POINTER(w_status))
			{
			r_status = CELL_QUEUE_SET_LOCKED(r_status); /* lock read status */
			status = CELL_QUEUE_PACK_STATUS(r_status, w_status);
			cont = !__stwcx(&q->status, status);
			}
		else
			{
			sync_wait(q, status);
			spin_count = 0;
			}
		} while (cont);

	/*
	 * Import barrier.  The caller reads the element next; on PowerPC those
	 * loads may otherwise be satisfied before the status word above was
	 * observed, returning data from before the producer's update.  The
	 * release side already issues lwsync before publishing the pointer, and
	 * this is the matching barrier on the acquire side.
	 */
	__lwsync();

	/* pointer mod q->depth */
	r_status = CELL_QUEUE_POINTER(r_status);
	if (r_status >= q->depth) r_status -= q->depth;
	ASSERT(r_status < q->depth);

	*elm = GET_QUEUE_ELM(q, r_status);

	return 1;
	}

/*
 * Non-waiting variant of cell_queue_pop_begin().
 *
 * Returns 0 straight away when the read status is locked or the read pointer
 * is not below the write pointer.  Otherwise marks the read status locked
 * with a lwarx/stwcx update (retried until the conditional store succeeds),
 * stores the address of the element at the read pointer in *elm and
 * returns 1.  Every SPIN_COUNT_MAX consecutive attempts sync_relax() is
 * called.
 */
static int cell_queue_try_pop_begin(cell_queue_t *q, const void **elm)
	{
	uint32_t status;
	uint32_t r_status, w_status;
	int spin_count = 0;
	int cont = 1;

	do
		{
		spin_count++;
		if (spin_count >= SPIN_COUNT_MAX)
			{
			spin_count = 0;
			sync_relax();
			}

		status = __lwarx(&q->status);
		r_status = CELL_QUEUE_UNPACK_R_STATUS(status);
		w_status = CELL_QUEUE_UNPACK_W_STATUS(status);

		if (!CELL_QUEUE_IS_LOCKED(r_status) /* no other thread reading from this queue */ &&
			CELL_QUEUE_POINTER(r_status) < CELL_QUEUE_POINTER(w_status))
			{
			r_status = CELL_QUEUE_SET_LOCKED(r_status); /* lock read status */
			status = CELL_QUEUE_PACK_STATUS(r_status, w_status);
			cont = !__stwcx(&q->status, status);
			}
		else
			{
			/* no elements */
			return 0;
			}
		} while (cont);

	/*
	 * Import barrier.  The caller reads the element next; on PowerPC those
	 * loads may otherwise be satisfied before the status word above was
	 * observed, returning data from before the producer's update.  The
	 * release side already issues lwsync before publishing the pointer, and
	 * this is the matching barrier on the acquire side.
	 */
	__lwsync();

	/* pointer mod q->depth */
	r_status = CELL_QUEUE_POINTER(r_status);
	if (r_status >= q->depth) r_status -= q->depth;
	ASSERT(r_status < q->depth);

	*elm = GET_QUEUE_ELM(q, r_status);

	return 1;
	}

/*
 * Completes a pop started by one of the pop_begin functions.
 *
 * Issues __lwsync() first, then updates q->status with a lwarx/stwcx loop:
 * the read status becomes its pointer value plus one, which also drops the
 * read lock, and when both pointers have reached q->depth both are reduced
 * by q->depth.  Every SPIN_COUNT_MAX consecutive attempts sync_relax() is
 * called.
 *
 * When ENABLE_PPE_TO_PPE is defined, sync_notify() is called if the write
 * pointer was equal to read pointer + depth before the update.  Always
 * returns 1.
 */
static int cell_queue_pop_end(cell_queue_t *q)
	{
	uint32_t word;
	uint32_t rd, wr;
	int spins = 0;
	int notify;

	__lwsync(); /* sync system memory */

	for (;;)
		{
		if (++spins >= SPIN_COUNT_MAX)
			{
			spins = 0;
			sync_relax();
			}

		word = __lwarx(&q->status);
		rd = CELL_QUEUE_UNPACK_R_STATUS(word);
		wr = CELL_QUEUE_UNPACK_W_STATUS(word);

		/* write pointer a full depth ahead before the update: remember to notify */
		notify = (CELL_QUEUE_POINTER(wr) == CELL_QUEUE_POINTER(rd) + q->depth);

		/* unlock read status and increment read pointer */
		rd = CELL_QUEUE_POINTER(rd) + 1;

		/* pull both pointers back once both have passed q->depth */
		if (CELL_QUEUE_POINTER(wr) >= q->depth &&
			CELL_QUEUE_POINTER(rd) >= q->depth)
			{
			wr -= q->depth;
			rd -= q->depth;
			}

		word = CELL_QUEUE_PACK_STATUS(rd, wr);
		if (__stwcx(&q->status, word))
			{
			break;
			}
		}

#ifdef ENABLE_PPE_TO_PPE
	if (notify)
		{
		sync_notify(q);
		}
#endif /* ENABLE_PPE_TO_PPE */

	return 1;
	}

/*
 * Removes one element from the queue identified by q_ea.
 *
 * Claims the element at the read pointer with cell_queue_pop_begin(), copies
 * q->elm_size bytes from it into buffer and finishes with
 * cell_queue_pop_end().  Returns the result of cell_queue_pop_end(), or 0 if
 * the element could not be claimed.
 */
int cell_queue_pop(uint64_t q_ea, void *buffer)
	{
	cell_queue_t *queue = (cell_queue_t *)(uintptr_t)q_ea;
	const void *slot;

	if (cell_queue_pop_begin(queue, &slot))
		{
		/* copy data */
		memcpy(buffer, slot, queue->elm_size);
		return cell_queue_pop_end(queue);
		}

	return 0;
	}

/*
 * Removes one element from the queue identified by q_ea if one can be
 * claimed immediately.
 *
 * Returns 0 when cell_queue_try_pop_begin() declines.  Otherwise copies
 * q->elm_size bytes from the claimed element into buffer and returns the
 * result of cell_queue_pop_end().
 */
int cell_queue_try_pop(uint64_t q_ea, void *buffer)
	{
	cell_queue_t *queue = (cell_queue_t *)(uintptr_t)q_ea;
	const void *slot;

	if (cell_queue_try_pop_begin(queue, &slot))
		{
		/* copy data */
		memcpy(buffer, slot, queue->elm_size);
		return cell_queue_pop_end(queue);
		}

	return 0;
	}
