# [fpc-pascal] Functions for dealing with floating-point precision

Thomas Kurz fpc.2021 at t-net.ruhr
Sun Jun 19 22:38:14 CEST 2022

```Hello,

in my program I have need for checking floating-point precision. I'm internally using floating-points for calculations, but in the end I have to use integer numbers. I cannot use Round() because I have to check for thresholds. I.e. that I wish to accept a value of 1.9999.... as being ">=2". As you can see, I cannot use Trunc() either, so I decided to check for the difference in ULPs (unit in the last place).

I couldn't find the corresponding functions in the RTL, but I think they should be included. I wrote the functions listed below. I would appreciate if the maintainers of FPC decide to add them to the Math unit. I skipped the functions for the Extended type, but if it's being desired, I can add hand them in later.

Kind regards,
Thomas

*** code:

const MAX_ULP_SINGLE = 2.028240960E+31;
const MAX_ULP_DOUBLE = 1.9958403095347198E+292;

function SingleToLongBits (Argument: Single): UInt32;
begin
Move (Argument, Result, SizeOf(UInt32));
end;

{\$ifdef FPC_HAS_TYPE_DOUBLE}
function DoubleToLongBits (Argument: Double): UInt64;
begin
Move (Argument, Result, SizeOf(UInt64));
end;
{\$endif FPC_HAS_TYPE_DOUBLE}

function LongBitsToSingle (Argument: UInt32): Single;
begin
Move (Argument, Result, SizeOf(Single));
end;

{\$ifdef FPC_HAS_TYPE_DOUBLE}
function LongBitsToDouble (Argument: UInt64): Double;
begin
Move (Argument, Result, SizeOf(Double));
end;
{\$endif FPC_HAS_TYPE_DOUBLE}

// returns the first floating-point argument with the sign of the second floating-point argument
function CopySign (x: Single; y: Single): Single;
begin
Exit (Abs(x) * Sign(y));
end;

{\$ifdef FPC_HAS_TYPE_DOUBLE}
// returns the first floating-point argument with the sign of the second floating-point argument
function CopySign (x: Double; y: Double): Double;
begin
Exit (Abs(x) * Sign(y));
end;
{\$endif FPC_HAS_TYPE_DOUBLE}

function IsSameSign (x: Single; y: Single): Boolean;
begin
Exit (CopySign(x, y) = x);
end;

{\$ifdef FPC_HAS_TYPE_DOUBLE}
function IsSameSign (x: Double; y: Double): Boolean;
begin
Exit (CopySign(x, y) = x);
end;
{\$endif FPC_HAS_TYPE_DOUBLE}

function NextAfter (Start: Single; Direction: Single): Single;
var
absStart: Single;
absDir: Single;
toZero: Boolean;
begin
// If either argument is a NaN, then NaN is returned.
if IsNan(Start) OR IsNan(Direction) then Exit (NaN);
// If both arguments compare as equal the second argument is returned.
if (Start = Direction) THEN Exit (Direction);
absStart := Abs(Start);
absDir := Abs(Direction);
toZero := NOT IsSameSign (Start, Direction) OR (AbsDir < absStart);
if (toZero) then begin
// we are reducing the magnitude, going toward zero.
if (absStart = MinSingle) then Exit (CopySign (0.0, Start));
if IsInfinite(absStart) THEN Exit (CopySign (MaxSingle, Start));
Exit (CopySign (LongBitsToSingle (Pred(SingleToLongBits(absStart))), Start));
end else begin
// we are increasing the magnitude, toward +-Infinity
if (Start = 0.0) then Exit (CopySign (MinSingle, Direction));
if (absStart = MaxSingle) then Exit (CopySign (Infinity, Start));
Exit (CopySign (LongBitsToSingle(Succ(SingleToLongBits(absStart))), Start));
end;
end;

{\$ifdef FPC_HAS_TYPE_DOUBLE}
function NextAfter (Start: Double; Direction: Double): Double;
var
absStart: Double;
absDir: Double;
toZero: Boolean;
begin
// If either argument is a NaN, then NaN is returned.
if IsNan(Start) OR IsNan(Direction) then Exit (NaN);
// If both arguments compare as equal the second argument is returned.
if (Start = Direction) THEN Exit (Direction);
absStart := Abs(Start);
absDir := Abs(Direction);
toZero := NOT IsSameSign (Start, Direction) OR (AbsDir < AbsStart);

if (toZero) then begin
// we are reducing the magnitude, going toward zero.
if (absStart = MinDouble) then Exit (CopySign (0.0, Start));
if IsInfinite(absStart) THEN Exit (CopySign (MaxDouble, Start));
Exit (CopySign (LongBitsToDouble (Pred(DoubleToLongBits(absStart))), Start));
end else begin
// we are increasing the magnitude, toward +-Infinity
if (Start = 0.0) then Exit (CopySign (MinDouble, Direction));
if (AbsStart = MaxDouble) then Exit (CopySign (Infinity, Start));
Exit (CopySign (LongBitsToDouble(Succ(DoubleToLongBits(absStart))), start));
end;
end;
{\$endif FPC_HAS_TYPE_DOUBLE}

function FloatPrior (Argument: Single): Single;
begin
Exit (NextAfter (Argument, NegInfinity));
end;

function FloatNext (Argument: Single): Single;
begin
Exit (NextAfter (Argument, Infinity));
end;

{\$ifdef FPC_HAS_TYPE_DOUBLE}
function FloatPrior (Argument: Double): Double;
begin
Exit (NextAfter (Argument, NegInfinity));
end;
{\$endif FPC_HAS_TYPE_DOUBLE}

{\$ifdef FPC_HAS_TYPE_DOUBLE}
function FloatNext (Argument: Double): Double;
begin
Exit (NextAfter (Argument, Infinity));
end;
{\$endif FPC_HAS_TYPE_DOUBLE}

function ULP (Argument: Single): Single;
begin
// If the argument is NaN, then the result is NaN.
if IsNaN (Argument) then Exit (NaN);
// If the argument is positive or negative infinity, then the result is positive infinity.
if IsInfinite (Argument) then Exit (Infinity);
// If the argument is positive or negative zero, then the result is Single.MIN_VALUE.
if (Argument = 0.0) then Exit (MinSingle);
Argument := Abs(Argument);
// If the argument is ±Single.MAX_VALUE, then the result is equal to 2^104.
if (Argument = MaxSingle) then Exit (MAX_ULP_SINGLE);
Exit (FloatNext(Argument) - Argument);
end;

{\$ifdef FPC_HAS_TYPE_DOUBLE}
function ULP (Argument: Double): Double;
begin
// If the argument is NaN, then the result is NaN.
if IsNaN (Argument) then Exit (NaN);
// If the argument is positive or negative infinity, then the result is positive infinity.
if IsInfinite (Argument) then Exit (Infinity);
// If the argument is positive or negative zero, then the result is Double.MIN_VALUE.
if (Argument = 0.0) then Exit (MinDouble);
Argument := Abs(Argument);
// If the argument is ±Double.MAX_VALUE, then the result is equal to 2^971.
if (Argument = MaxDouble) then Exit (MAX_ULP_DOUBLE);
Exit (FloatNext(Argument) - Argument);
end;
{\$endif FPC_HAS_TYPE_DOUBLE}

```