#!/usr/bin/env perl
# Copyright (c) 2023, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

use strict;

# Command line: [flavour] output
#
# The flavour is optional. If the first argument contains a dot it is taken to
# be the output file name, and no flavour is passed on to the translator.
my $flavour = shift;
my $output  = shift;
if (defined $flavour && $flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Directory portion of the path this script was invoked as, including the
# trailing separator. Falls back to the empty string (i.e. a path relative to
# the current directory) when $0 carries no directory component, rather than
# relying on whatever $1 happened to hold.
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for arm-xlate.pl next to this script first, then in the perlasm
# directory three levels up. Each candidate is assigned to $xlate and accepted
# only if it names an existing plain file.
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Run arm-xlate.pl under the same perl interpreter ($^X) and alias STDOUT to
# the pipe feeding it, so that everything printed from here on goes to the
# translator instead of directly to the output file. Fail immediately if the
# pipe cannot be started instead of discovering it only when STDOUT is closed.
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT = *OUT;

# Register aliases interpolated into the assembly template below.
#
# $rp, $ap, $bp and $num are named after the parameters in the C prototypes
# quoted in the template (result pointer, the two operand pointers and the word
# count). $a0/$a1 and $b0/$b1 hold the words loaded from ap and bp, and
# $num_pairs counts the two-word loop iterations.
my ($rp, $ap, $bp, $num) = ("x0", "x1", "x2", "x3");
my ($a0, $a1, $b0, $b1, $num_pairs) = ("x4", "x5", "x6", "x7", "x8");

# The assembly source for bn_add_words and bn_sub_words. This is an
# interpolating here-document: every \$name above is replaced by its register
# before the text is printed. Both routines share the same shape -- initialise
# the carry flag, process words two at a time, handle one odd trailing word,
# then turn the final carry/borrow into the return value in x0 -- and each is
# closed with a .size directive.
my $code = <<____;
#include <openssl/arm_arch.h>

.text

// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                       size_t num);
.type	bn_add_words, %function
.globl	bn_add_words
.align	4
bn_add_words:
	AARCH64_VALID_CALL_TARGET
	# Clear the carry flag.
	cmn	xzr, xzr

	# aarch64 can load two registers at a time, so we do two loop iterations at
	# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
	# operations to use CBNZ without clobbering the carry flag.
	lsr	$num_pairs, $num, #1
	and	$num, $num, #1

	cbz	$num_pairs, .Ladd_tail
.Ladd_loop:
	ldp	$a0, $a1, [$ap], #16
	ldp	$b0, $b1, [$bp], #16
	sub	$num_pairs, $num_pairs, #1
	adcs	$a0, $a0, $b0
	adcs	$a1, $a1, $b1
	stp	$a0, $a1, [$rp], #16
	cbnz	$num_pairs, .Ladd_loop

.Ladd_tail:
	cbz	$num, .Ladd_exit
	ldr	$a0, [$ap], #8
	ldr	$b0, [$bp], #8
	adcs	$a0, $a0, $b0
	str	$a0, [$rp], #8

.Ladd_exit:
	cset	x0, cs
	ret
.size	bn_add_words,.-bn_add_words

// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                       size_t num);
.type	bn_sub_words, %function
.globl	bn_sub_words
.align	4
bn_sub_words:
	AARCH64_VALID_CALL_TARGET
	# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
	# so we want C = 1 here.
	cmp	xzr, xzr

	# aarch64 can load two registers at a time, so we do two loop iterations at
	# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
	# operations to use CBNZ without clobbering the carry flag.
	lsr	$num_pairs, $num, #1
	and	$num, $num, #1

	cbz	$num_pairs, .Lsub_tail
.Lsub_loop:
	ldp	$a0, $a1, [$ap], #16
	ldp	$b0, $b1, [$bp], #16
	sub	$num_pairs, $num_pairs, #1
	sbcs	$a0, $a0, $b0
	sbcs	$a1, $a1, $b1
	stp	$a0, $a1, [$rp], #16
	cbnz	$num_pairs, .Lsub_loop

.Lsub_tail:
	cbz	$num, .Lsub_exit
	ldr	$a0, [$ap], #8
	ldr	$b0, [$bp], #8
	sbcs	$a0, $a0, $b0
	str	$a0, [$rp], #8

.Lsub_exit:
	cset x0, cc
	ret
.size	bn_sub_words,.-bn_sub_words
____

# Write the generated assembly to STDOUT, then close the handle. The close is
# checked so that a failure while flushing the output is reported instead of
# being silently dropped.
print STDOUT $code;
close(STDOUT) or die "error closing STDOUT: $!";
