/*
 * Copyright 2023 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkRasterPipelineOpContexts_DEFINED
#define SkRasterPipelineOpContexts_DEFINED

#include <algorithm>
#include <cstddef>
#include <cstdint>

namespace SkSL { class TraceHook; }

// The largest number of pixels we handle at a time. We have a separate value for the largest number
// of pixels we handle in the highp pipeline. Many of the context structs in this file are only used
// by stages that have no lowp implementation. They can therefore use the (smaller) highp value to
// save memory in the arena.
inline static constexpr int SkRasterPipeline_kMaxStride = 16;
inline static constexpr int SkRasterPipeline_kMaxStride_highp = 16;

// How much space to allocate for each MemoryCtx scratch buffer, as part of tail-pixel handling.
inline static constexpr size_t SkRasterPipeline_MaxScratchPerPatch =
        std::max(SkRasterPipeline_kMaxStride_highp * 16,  // 16 == largest highp bpp (RGBA_F32)
                 SkRasterPipeline_kMaxStride * 4);        // 4 == largest lowp bpp (RGBA_8888)

// These structs hold the context data for many of the Raster Pipeline ops.
struct SkRasterPipeline_MemoryCtx {
    void* pixels;
    int   stride;
};

// Raster Pipeline typically processes N (4, 8, 16) pixels at a time, in SIMT fashion. If the
// number of pixels in a row isn't evenly divisible by N, there will be leftover pixels; this is
// called the "tail". To avoid reading or writing past the end of any source or destination buffers
// when we reach the tail:
//
//   1) Source buffers have their tail contents copied to a scratch buffer that is at least N wide.
//      In practice, each scratch buffer uses SkRasterPipeline_MaxScratchPerPatch bytes.
//   2) Each MemoryCtx in the pipeline is patched, such that access to them (at the current scanline
//      and x-offset) will land in the scratch buffer.
//   3) Pipeline is run as normal (with all memory access happening safely in the scratch buffers).
//   4) Destination buffers have their tail contents copied back from the scratch buffer.
//   5) Each MemoryCtx is "un-patched".
//
// To do all of this, the pipeline creates a MemoryCtxPatch for each unique MemoryCtx referenced by
// the pipeline.
struct SkRasterPipeline_MemoryCtxInfo {
    SkRasterPipeline_MemoryCtx* context;

    int bytesPerPixel;
    bool load;
    bool store;
};

struct SkRasterPipeline_MemoryCtxPatch {
    SkRasterPipeline_MemoryCtxInfo info;

    void* backup;  // Remembers context->pixels so we can restore it
    std::byte scratch[SkRasterPipeline_MaxScratchPerPatch];
};

struct SkRasterPipeline_GatherCtx {
    const void* pixels;
    int         stride;
    float       width;
    float       height;
    float       weights[16];  // for bicubic and bicubic_clamp_8888
    // Controls whether pixel i-1 or i is selected when floating point sample position is exactly i.
    bool        roundDownAtInteger = false;
};

// State shared by save_xy, accumulate, and bilinear_* / bicubic_*.
struct SkRasterPipeline_SamplerCtx {
    float      x[SkRasterPipeline_kMaxStride_highp];
    float      y[SkRasterPipeline_kMaxStride_highp];
    float     fx[SkRasterPipeline_kMaxStride_highp];
    float     fy[SkRasterPipeline_kMaxStride_highp];
    float scalex[SkRasterPipeline_kMaxStride_highp];
    float scaley[SkRasterPipeline_kMaxStride_highp];

    // for bicubic_[np][13][xy]
    float weights[16];
    float wx[4][SkRasterPipeline_kMaxStride_highp];
    float wy[4][SkRasterPipeline_kMaxStride_highp];
};

struct SkRasterPipeline_TileCtx {
    float scale;
    float invScale; // cache of 1/scale
    // When in the reflection portion of mirror tiling we need to snap the opposite direction
    // at integer sample points than when in the forward direction. This controls which way we bias
    // in the reflection. It should be 1 if SkRasterPipeline_GatherCtx::roundDownAtInteger is true
    // and otherwise -1.
    int   mirrorBiasDir = -1;
};

struct SkRasterPipeline_DecalTileCtx {
    uint32_t mask[SkRasterPipeline_kMaxStride];
    float    limit_x;
    float    limit_y;
    // These control which edge of the interval is included (i.e. closed interval at 0 or at limit).
    // They should be set to limit_x and limit_y if SkRasterPipeline_GatherCtx::roundDownAtInteger
    // is true and otherwise zero.
    float    inclusiveEdge_x = 0;
    float    inclusiveEdge_y = 0;
};

enum class SkPerlinNoiseShaderType;

struct SkRasterPipeline_PerlinNoiseCtx {
    SkPerlinNoiseShaderType noiseType;
    float baseFrequencyX, baseFrequencyY;
    float stitchDataInX, stitchDataInY;
    bool stitching;
    int numOctaves;
    const uint8_t* latticeSelector;  // [256 values]
    const uint16_t* noiseData;       // [4 channels][256 elements][vector of 2]
};

// State used by mipmap_linear_*
struct SkRasterPipeline_MipmapCtx {
    // Original coords, saved before the base level logic
    float x[SkRasterPipeline_kMaxStride_highp];
    float y[SkRasterPipeline_kMaxStride_highp];

    // Base level color
    float r[SkRasterPipeline_kMaxStride_highp];
    float g[SkRasterPipeline_kMaxStride_highp];
    float b[SkRasterPipeline_kMaxStride_highp];
    float a[SkRasterPipeline_kMaxStride_highp];

    // Scale factors to transform base level coords to lower level coords
    float scaleX;
    float scaleY;

    float lowerWeight;
};

struct SkRasterPipeline_CoordClampCtx {
    float min_x, min_y;
    float max_x, max_y;
};

struct SkRasterPipeline_CallbackCtx {
    void (*fn)(SkRasterPipeline_CallbackCtx* self,
               int active_pixels /*<= SkRasterPipeline_kMaxStride_highp*/);

    // When called, fn() will have our active pixels available in rgba.
    // When fn() returns, the pipeline will read back those active pixels from read_from.
    float rgba[4*SkRasterPipeline_kMaxStride_highp];
    float* read_from = rgba;
};

// state shared by stack_checkpoint and stack_rewind
struct SkRasterPipelineStage;

struct SkRasterPipeline_RewindCtx {
    float  r[SkRasterPipeline_kMaxStride_highp];
    float  g[SkRasterPipeline_kMaxStride_highp];
    float  b[SkRasterPipeline_kMaxStride_highp];
    float  a[SkRasterPipeline_kMaxStride_highp];
    float dr[SkRasterPipeline_kMaxStride_highp];
    float dg[SkRasterPipeline_kMaxStride_highp];
    float db[SkRasterPipeline_kMaxStride_highp];
    float da[SkRasterPipeline_kMaxStride_highp];
    std::byte* base;
    SkRasterPipelineStage* stage;
};

struct SkRasterPipeline_GradientCtx {
    size_t stopCount;
    float* fs[4];
    float* bs[4];
    float* ts;
};

struct SkRasterPipeline_EvenlySpaced2StopGradientCtx {
    float f[4];
    float b[4];
};

struct SkRasterPipeline_2PtConicalCtx {
    uint32_t fMask[SkRasterPipeline_kMaxStride_highp];
    float    fP0,
             fP1;
};

struct SkRasterPipeline_UniformColorCtx {
    float r,g,b,a;
    uint16_t rgba[4];  // [0,255] in a 16-bit lane.
};

struct SkRasterPipeline_EmbossCtx {
    SkRasterPipeline_MemoryCtx mul,
                               add;
};

struct SkRasterPipeline_TablesCtx {
    const uint8_t *r, *g, *b, *a;
};

using SkRPOffset = uint32_t;

struct SkRasterPipeline_InitLaneMasksCtx {
    uint8_t* tail;
};

struct SkRasterPipeline_ConstantCtx {
    int32_t value;
    SkRPOffset dst;
};

struct SkRasterPipeline_UniformCtx {
    int32_t* dst;
    const int32_t* src;
};

struct SkRasterPipeline_BinaryOpCtx {
    SkRPOffset dst;
    SkRPOffset src;
};

struct SkRasterPipeline_TernaryOpCtx {
    SkRPOffset dst;
    SkRPOffset delta;
};

struct SkRasterPipeline_MatrixMultiplyCtx {
    SkRPOffset dst;
    uint8_t leftColumns, leftRows, rightColumns, rightRows;
};

struct SkRasterPipeline_SwizzleCtx {
    // If we are processing more than 16 pixels at a time, an 8-bit offset won't be sufficient and
    // `offsets` will need to use uint16_t (or dial down the premultiplication).
    static_assert(SkRasterPipeline_kMaxStride_highp <= 16);

    SkRPOffset dst;
    uint8_t offsets[4];  // values must be byte offsets (4 * highp-stride * component-index)
};

struct SkRasterPipeline_ShuffleCtx {
    int32_t* ptr;
    int count;
    uint16_t offsets[16];  // values must be byte offsets (4 * highp-stride * component-index)
};

struct SkRasterPipeline_SwizzleCopyCtx {
    int32_t* dst;
    const int32_t* src;   // src values must _not_ overlap dst values
    uint16_t offsets[4];  // values must be byte offsets (4 * highp-stride * component-index)
};

struct SkRasterPipeline_CopyIndirectCtx {
    int32_t* dst;
    const int32_t* src;
    const uint32_t *indirectOffset;  // this applies to `src` or `dst` based on the op
    uint32_t indirectLimit;          // the indirect offset is clamped to this upper bound
    uint32_t slots;                  // the number of slots to copy
};

struct SkRasterPipeline_SwizzleCopyIndirectCtx : public SkRasterPipeline_CopyIndirectCtx {
    uint16_t offsets[4];  // values must be byte offsets (4 * highp-stride * component-index)
};

struct SkRasterPipeline_BranchCtx {
    int offset;  // contains the label ID during compilation, and the program offset when compiled
};

struct SkRasterPipeline_BranchIfAllLanesActiveCtx : public SkRasterPipeline_BranchCtx {
    uint8_t* tail = nullptr;  // lanes past the tail are _never_ active, so we need to exclude them
};

struct SkRasterPipeline_BranchIfEqualCtx : public SkRasterPipeline_BranchCtx {
    int value;
    const int* ptr;
};

struct SkRasterPipeline_CaseOpCtx {
    int expectedValue;
    SkRPOffset offset;  // points to a pair of adjacent I32s: {I32 actualValue, I32 defaultMask}
};

struct SkRasterPipeline_TraceFuncCtx {
    const int* traceMask;
    SkSL::TraceHook* traceHook;
    int funcIdx;
};

struct SkRasterPipeline_TraceScopeCtx {
    const int* traceMask;
    SkSL::TraceHook* traceHook;
    int delta;
};

struct SkRasterPipeline_TraceLineCtx {
    const int* traceMask;
    SkSL::TraceHook* traceHook;
    int lineNumber;
};

struct SkRasterPipeline_TraceVarCtx {
    const int* traceMask;
    SkSL::TraceHook* traceHook;
    int slotIdx, numSlots;
    const int* data;
    const uint32_t *indirectOffset;  // can be null; if set, an offset applied to `data`
    uint32_t indirectLimit;          // the indirect offset is clamped to this upper bound
};

#endif  // SkRasterPipelineOpContexts_DEFINED
