#pragma once

////////////////////////////////////////////////////////////////////////////////
// The MIT License (MIT)
//
// Copyright (c) 2017 Nicholas Frechette & Animation Compression Library contributors
// Copyright (c) 2018 Nicholas Frechette & Realtime Math contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
////////////////////////////////////////////////////////////////////////////////

#include "rtm/math.h"

#include <cstdint>

namespace rtm
{
#if defined(RTM_SSE2_INTRINSICS)
	//////////////////////////////////////////////////////////////////////////
	// A quaternion (4D complex number) where the real part is the [w] component.
	// It accurately represents a 3D rotation with no gimbal lock as long as it is kept normalized.
	// SSE2 build: this is a direct alias of the native __m128 type, not a wrapper struct.
	//////////////////////////////////////////////////////////////////////////
	using quatf = __m128;

	//////////////////////////////////////////////////////////////////////////
	// A quaternion (4D complex number) where the real part is the [w] component.
	// It accurately represents a 3D rotation with no gimbal lock as long as it is kept normalized.
	// SSE2 build: the value is stored in two __m128d members named `xy` and `zw`.
	//////////////////////////////////////////////////////////////////////////
	struct quatd
	{
		__m128d xy, zw;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4D vector.
	// SSE2 build: this is a direct alias of the native __m128 type, which makes it
	// the very same type as quatf and mask4f in this configuration.
	//////////////////////////////////////////////////////////////////////////
	using vector4f = __m128;

	//////////////////////////////////////////////////////////////////////////
	// A 4D vector.
	// SSE2 build: the value is stored in two __m128d members named `xy` and `zw`.
	//////////////////////////////////////////////////////////////////////////
	struct vector4d
	{
		__m128d xy, zw;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4x32 bit vector comparison mask for 32 bit floats: ~0 if true, 0 otherwise.
	// SSE2 build: this is a direct alias of the native __m128 type, which makes it
	// the very same type as vector4f and quatf in this configuration.
	//////////////////////////////////////////////////////////////////////////
	using mask4f = __m128;

	//////////////////////////////////////////////////////////////////////////
	// A 4x64 bit vector comparison mask for 64 bit floats: ~0 if true, 0 otherwise.
	// SSE2 build: the mask is stored in two __m128d members named `xy` and `zw`.
	//////////////////////////////////////////////////////////////////////////
	struct mask4d
	{
		__m128d xy, zw;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4x32 bit vector comparison mask: ~0 if true, 0 otherwise.
	// SSE2 build: this is a direct alias of the native integer type __m128i.
	//////////////////////////////////////////////////////////////////////////
	using mask4i = __m128i;

	//////////////////////////////////////////////////////////////////////////
	// A 4x64 bit vector comparison mask: ~0 if true, 0 otherwise.
	// SSE2 build: the mask is stored in two __m128i members named `xy` and `zw`.
	//////////////////////////////////////////////////////////////////////////
	struct mask4q
	{
		__m128i xy, zw;
	};
#elif defined(RTM_NEON_INTRINSICS)
	//////////////////////////////////////////////////////////////////////////
	// A quaternion (4D complex number) where the real part is the [w] component.
	// It accurately represents a 3D rotation with no gimbal lock as long as it is kept normalized.
	// NEON build: this is a direct alias of the native float32x4_t type, not a wrapper struct.
	//////////////////////////////////////////////////////////////////////////
	using quatf = float32x4_t;

	//////////////////////////////////////////////////////////////////////////
	// A quaternion (4D complex number) where the real part is the [w] component.
	// It accurately represents a 3D rotation with no gimbal lock as long as it is kept normalized.
	// NEON build: stored as four plain doubles in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) quatd
	{
		double x, y, z, w;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4D vector.
	// NEON build: this is a direct alias of the native float32x4_t type, which makes it
	// the very same type as quatf and mask4f in this configuration.
	//////////////////////////////////////////////////////////////////////////
	using vector4f = float32x4_t;

	//////////////////////////////////////////////////////////////////////////
	// A 4D vector.
	// NEON build: stored as four plain doubles in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) vector4d
	{
		double x, y, z, w;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4x32 bit vector comparison mask for 32 bit floats: ~0 if true, 0 otherwise.
	// NEON build: this is a direct alias of float32x4_t (a float vector type, not an
	// integer one), which makes it the very same type as vector4f and quatf here.
	//////////////////////////////////////////////////////////////////////////
	using mask4f = float32x4_t;

	//////////////////////////////////////////////////////////////////////////
	// A 4x64 bit vector comparison mask for 64 bit floats: ~0 if true, 0 otherwise.
	// NEON build: stored as four 64 bit unsigned integers in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) mask4d
	{
		uint64_t x, y, z, w;
	};

#if defined(_MSC_VER)
	// MSVC uses a simple typedef to an identical underlying type for uint32x4_t and float32x4_t
	// To avoid issues of duplicate symbols, we introduce a concrete type

	//////////////////////////////////////////////////////////////////////////
	// A 4x32 bit vector comparison mask: ~0 if true, 0 otherwise.
	// MSVC + NEON build: a concrete 16 byte aligned struct wrapping a uint32x4_t,
	// used instead of a plain alias so that mask4i is its own distinct type.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) mask4i
	{
		// The wrapped mask; RTM_IMPL_MASK4i_GET reads it and RTM_IMPL_MASK4i_SET builds a mask4i from it.
		uint32x4_t value;
	};

	// Helper macros to simplify usage: they convert between the mask4i wrapper struct
	// and the raw value stored in its 'value' member.
	// GET parenthesizes its argument so that a compound expression (e.g. a ternary or a
	// dereference) is evaluated as a whole before '.value' is applied to it.
	// SET needs no extra parentheses: the argument is already delimited by the braces.
	#define RTM_IMPL_MASK4i_GET(mask) (mask).value
	#define RTM_IMPL_MASK4i_SET(mask) mask4i{ mask }
#else
	//////////////////////////////////////////////////////////////////////////
	// A 4x32 bit vector comparison mask: ~0 if true, 0 otherwise.
	// NEON build with a compiler other than MSVC: a direct alias of the native uint32x4_t type.
	//////////////////////////////////////////////////////////////////////////
	using mask4i = uint32x4_t;

	// Helper macros to simplify usage: in this configuration mask4i is the raw type itself,
	// so both macros simply pass their argument through.
	// The argument is parenthesized so that the expansion always behaves as a single operand,
	// regardless of the operators used in the argument or around the macro invocation.
	#define RTM_IMPL_MASK4i_GET(mask) (mask)
	#define RTM_IMPL_MASK4i_SET(mask) (mask)
#endif

	//////////////////////////////////////////////////////////////////////////
	// A 4x64 bit vector comparison mask: ~0 if true, 0 otherwise.
	// NEON build: stored as four 64 bit unsigned integers in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) mask4q
	{
		uint64_t x, y, z, w;
	};
#else
	//////////////////////////////////////////////////////////////////////////
	// A quaternion (4D complex number) where the real part is the [w] component.
	// It accurately represents a 3D rotation with no gimbal lock as long as it is kept normalized.
	// Scalar fallback: stored as four plain floats in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) quatf
	{
		float x, y, z, w;
	};

	//////////////////////////////////////////////////////////////////////////
	// A quaternion (4D complex number) where the real part is the [w] component.
	// It accurately represents a 3D rotation with no gimbal lock as long as it is kept normalized.
	// Scalar fallback: stored as four plain doubles in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) quatd
	{
		double x, y, z, w;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4D vector.
	// Scalar fallback: stored as four plain floats in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) vector4f
	{
		float x, y, z, w;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4D vector.
	// Scalar fallback: stored as four plain doubles in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) vector4d
	{
		double x, y, z, w;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4x32 bit vector comparison mask for 32 bit floats: ~0 if true, 0 otherwise.
	// Scalar fallback: stored as four 32 bit unsigned integers in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) mask4f
	{
		uint32_t x, y, z, w;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4x64 bit vector comparison mask for 64 bit floats: ~0 if true, 0 otherwise.
	// Scalar fallback: stored as four 64 bit unsigned integers in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) mask4d
	{
		uint64_t x, y, z, w;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4x32 bit vector comparison mask: ~0 if true, 0 otherwise.
	// Scalar fallback: stored as four 32 bit unsigned integers in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) mask4i
	{
		uint32_t x, y, z, w;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 4x64 bit vector comparison mask: ~0 if true, 0 otherwise.
	// Scalar fallback: stored as four 64 bit unsigned integers in a 16 byte aligned struct.
	//////////////////////////////////////////////////////////////////////////
	struct alignas(16) mask4q
	{
		uint64_t x, y, z, w;
	};
#endif

#if defined(RTM_SSE2_INTRINSICS)
	// With SSE2, we use a concrete type for scalarf/scalard unlike other platforms and other types
	// like vector4f and quatf. We don't use a concrete type when we can avoid it to help the compiler
	// optimize as much as possible. But we must be able to tell a scalar apart from a vector for
	// return type overloading and argument overloading.
	// For example, we want to support vector_mul(vec4, vec4) and vector_mul(vec4, scalar).
	// When scalarf is a 'float', the type is distinct and everything works as expected
	// but if we use __m128, the type is the same as vector4f and we won't be able to tell
	// them apart.
	// Another example is vector_dot where we want to support returning a float, a scalarf, and
	// a vector4f depending on what the user expects. We could always return a float/scalarf but
	// if we need a vector4f it is less efficient if _mm_dp_ps is used: we would have an extra
	// shuffle.
	// Using a concrete type here allows us to tell the types apart and properly overload them
	// when required. The compiler should still be able to optimize properly.

	//////////////////////////////////////////////////////////////////////////
	// A SIMD friendly scalar type. Different architectures have an easier or harder time
	// working with scalar floating point numbers. For example, older PowerPC processors
	// had to write to memory and reload from it to transfer from one register file into
	// another (e.g. convert from a float to a SIMD vector). Modern processors handle
	// this much better but inefficiencies remain, especially with SSE. While it is
	// free to convert a SIMD scalar into a float with _mm_cvtss_f32(..) the reverse generally
	// requires the compiler to fill the unused SIMD lanes with known values (either zero or the same).
	// This introduces an extra instruction that isn't always required when only the first lane is used
	// such as with scalar_sqrt_reciprocal(..). By introducing a type for SIMD scalar values,
	// each platform is free to make an optimal choice.
	//
	// SSE2 build: a concrete struct wrapping an __m128 so that the scalar type stays
	// distinct from vector4f (itself an alias of __m128) for overloading purposes.
	//////////////////////////////////////////////////////////////////////////
	struct scalarf
	{
		// The wrapped SSE value.
		__m128 value;
	};

	//////////////////////////////////////////////////////////////////////////
	// A SIMD friendly scalar type. Different architectures have an easier or harder time
	// working with scalar floating point numbers. For example, older PowerPC processors
	// had to write to memory and reload from it to transfer from one register file into
	// another (e.g. convert from a float to a SIMD vector). Modern processors handle
	// this much better but inefficiencies remain, especially with SSE. While it is
	// free to convert a SIMD scalar into a float with _mm_cvtss_f32(..) the reverse generally
	// requires the compiler to fill the unused SIMD lanes with known values (either zero or the same).
	// This introduces an extra instruction that isn't always required when only the first lane is used
	// such as with scalar_sqrt_reciprocal(..). By introducing a type for SIMD scalar values,
	// each platform is free to make an optimal choice.
	//
	// SSE2 build: the 64 bit float flavor, a concrete struct wrapping an __m128d.
	// Note: the explanation above is written in terms of the 32 bit float intrinsic.
	//////////////////////////////////////////////////////////////////////////
	struct scalard
	{
		// The wrapped SSE value.
		__m128d value;
	};
#else
	//////////////////////////////////////////////////////////////////////////
	// A SIMD friendly scalar type. Different architectures have an easier or harder time
	// working with scalar floating point numbers. For example, older PowerPC processors
	// had to write to memory and reload from it to transfer from one register file into
	// another (e.g. convert from a float to a SIMD vector). Modern processors handle
	// this much better but inefficiencies remain, especially with SSE. While it is
	// free to convert a SIMD scalar into a float with _mm_cvtss_f32(..) the reverse generally
	// requires the compiler to fill the unused SIMD lanes with known values (either zero or the same).
	// This introduces an extra instruction that isn't always required when only the first lane is used
	// such as with scalar_sqrt_reciprocal(..). By introducing a type for SIMD scalar values,
	// each platform is free to make an optimal choice.
	//
	// Without SSE2: a plain alias of 'float'.
	//////////////////////////////////////////////////////////////////////////
	using scalarf = float;

	//////////////////////////////////////////////////////////////////////////
	// A SIMD friendly scalar type. Different architectures have an easier or harder time
	// working with scalar floating point numbers. For example, older PowerPC processors
	// had to write to memory and reload from it to transfer from one register file into
	// another (e.g. convert from a float to a SIMD vector). Modern processors handle
	// this much better but inefficiencies remain, especially with SSE. While it is
	// free to convert a SIMD scalar into a float with _mm_cvtss_f32(..) the reverse generally
	// requires the compiler to fill the unused SIMD lanes with known values (either zero or the same).
	// This introduces an extra instruction that isn't always required when only the first lane is used
	// such as with scalar_sqrt_reciprocal(..). By introducing a type for SIMD scalar values,
	// each platform is free to make an optimal choice.
	//
	// Without SSE2: a plain alias of 'double'.
	//////////////////////////////////////////////////////////////////////////
	using scalard = double;
#endif

	//////////////////////////////////////////////////////////////////////////
	// A QVV transform: a 3D rotation (quaternion), a 3D translation (vector), and a 3D scale (vector).
	// Positive scaling is handled properly but negative scaling is a bit more problematic:
	// a best effort is made by converting the quaternion to a matrix during those operations.
	// If scale fidelity is important, consider using an affine matrix 3x4 instead.
	// This is the 32 bit float flavor, built from quatf and vector4f.
	//////////////////////////////////////////////////////////////////////////
	struct qvvf
	{
		quatf rotation;
		vector4f translation, scale;
	};

	//////////////////////////////////////////////////////////////////////////
	// A QVV transform: a 3D rotation (quaternion), a 3D translation (vector), and a 3D scale (vector).
	// Positive scaling is handled properly but negative scaling is a bit more problematic:
	// a best effort is made by converting the quaternion to a matrix during those operations.
	// If scale fidelity is important, consider using an affine matrix 3x4 instead.
	// This is the 64 bit float flavor, built from quatd and vector4d.
	//////////////////////////////////////////////////////////////////////////
	struct qvvd
	{
		quatd rotation;
		vector4d translation, scale;
	};

	//////////////////////////////////////////////////////////////////////////
	// A generic 3x3 matrix made of three vector4f columns.
	// Note: The [w] component of every column vector is undefined.
	//////////////////////////////////////////////////////////////////////////
	struct matrix3x3f
	{
		vector4f x_axis, y_axis, z_axis;
	};

	//////////////////////////////////////////////////////////////////////////
	// A generic 3x3 matrix made of three vector4d columns.
	// Note: The [w] component of every column vector is undefined.
	//////////////////////////////////////////////////////////////////////////
	struct matrix3x3d
	{
		vector4d x_axis, y_axis, z_axis;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 3x4 affine matrix represents a 3D rotation, 3D translation, and 3D scale.
	// It properly deals with skew/shear when present but once scale with mirroring is combined,
	// it cannot be safely extracted back.
	//
	// Affine matrices are 4x4 but their last row is always equal to [0, 0, 0, 1],
	// which is why only 3x4 values are meaningful.
	// Note: We do not track the implicit last row and it is thus undefined.
	//
	// Left handed coordinate system:
	// X axis == forward
	// Y axis == right
	// Z axis == up
	//
	// This is the 32 bit float flavor, stored as four vector4f members.
	//////////////////////////////////////////////////////////////////////////
	struct matrix3x4f
	{
		vector4f x_axis, y_axis, z_axis, w_axis;
	};

	//////////////////////////////////////////////////////////////////////////
	// A 3x4 affine matrix represents a 3D rotation, 3D translation, and 3D scale.
	// It properly deals with skew/shear when present but once scale with mirroring is combined,
	// it cannot be safely extracted back.
	//
	// Affine matrices are 4x4 but their last row is always equal to [0, 0, 0, 1],
	// which is why only 3x4 values are meaningful.
	// Note: We do not track the implicit last row and it is thus undefined.
	//
	// Left handed coordinate system:
	// X axis == forward
	// Y axis == right
	// Z axis == up
	//
	// This is the 64 bit float flavor, stored as four vector4d members.
	//////////////////////////////////////////////////////////////////////////
	struct matrix3x4d
	{
		vector4d x_axis, y_axis, z_axis, w_axis;
	};

	//////////////////////////////////////////////////////////////////////////
	// A generic 4x4 matrix, stored as four vector4f members.
	//////////////////////////////////////////////////////////////////////////
	struct matrix4x4f
	{
		vector4f x_axis, y_axis, z_axis, w_axis;
	};

	//////////////////////////////////////////////////////////////////////////
	// A generic 4x4 matrix, stored as four vector4d members.
	//////////////////////////////////////////////////////////////////////////
	struct matrix4x4d
	{
		vector4d x_axis, y_axis, z_axis, w_axis;
	};

	//////////////////////////////////////////////////////////////////////////
	// Identifies a source component when mixing/shuffling/permuting two vectors.
	// [xyzw] select a component of the first input, [abcd] select a component of the second input.
	//////////////////////////////////////////////////////////////////////////
	enum class mix4
	{
		// Components of the first input
		x = 0, y = 1, z = 2, w = 3,

		// Components of the second input
		a = 4, b = 5, c = 6, d = 7,
	};

	//////////////////////////////////////////////////////////////////////////
	// Identifies one of the three axes in 3D.
	//////////////////////////////////////////////////////////////////////////
	enum class axis3
	{
		x = 0, y = 1, z = 2,
	};

	//////////////////////////////////////////////////////////////////////////
	// Identifies one of the four axes in 4D.
	//////////////////////////////////////////////////////////////////////////
	enum class axis4
	{
		x = 0, y = 1, z = 2, w = 3,
	};


	//////////////////////////////////////////////////////////////////////////
	// Various unaligned types suitable for interop with GPUs, etc.
	//////////////////////////////////////////////////////////////////////////


	// Two 32 bit floats, declared without any explicit alignment.
	struct float2f
	{
		float x, y;
	};

	// Three 32 bit floats, declared without any explicit alignment.
	struct float3f
	{
		float x, y, z;
	};

	// Four 32 bit floats, declared without any explicit alignment.
	struct float4f
	{
		float x, y, z, w;
	};

	// Two 64 bit floats, declared without any explicit alignment.
	struct float2d
	{
		double x, y;
	};

	// Three 64 bit floats, declared without any explicit alignment.
	struct float3d
	{
		double x, y, z;
	};

	// Four 64 bit floats, declared without any explicit alignment.
	struct float4d
	{
		double x, y, z, w;
	};
}

// Always include the register passing typedefs
#include "rtm/impl/type_args.h"