IEEE-754 (Floating Point Arithmetic

Kahan summation algorithm

The Kahan summation algorithm (aka compensated summation) reduces the numerical error when adding a series of finite precision floating point numbers.

The error reduction can be achieved by keeping a separate variable that stores a running compensation to accumulate small errors.

C / C++

The C99 and C++ 2003 standard only mandate that the double is at least as precise as the float. The rest is implementation defined.

represent-ieee-754.c

represent-ieee-754.c contains some simple C functions that allow to create a string with the binary representation of a double.

A 0 bit is generally represented with a dot.

An exponent-1 bit is represented with an E. A mantissa-1 bit is represented with a M. The negative sign bit is represented with a S.

#include <ieee754.h>

#include "represent-ieee-754.h"

struct D {
  unsigned char pos;
  unsigned char cnt;
  char*    representation;
};

void add_char(char c, struct D *d) {

  d->representation[d->pos] = c;
  d->pos++;

}

void add_repr_char(char c, struct D *d) {

  add_char(c, d);

  d->cnt++;

  if (! (d->cnt % 8)) {
    add_char(' ', d);
    add_char('|', d);
    add_char(' ', d);
  }
  else if (! (d->cnt % 4)) {
    add_char(' ', d);
  }


}

void represent_ieee_754_double(double d, ieee_754_double_representation r) {

  union ieee754_double dd;

  dd.d = d;
  struct D dr;
  dr.cnt=0;
  dr.pos=0;
  dr.representation = r;
//dr.representation[IEEE_755_DOUBLE_REPRESENTATION_SIZE] = 0x00;

  if (dd.ieee.negative) {
    add_repr_char('S', &dr);
  }
  else {
    add_repr_char('.', &dr);
  }

  for (unsigned i=1<<10; i >= 1; i>>=1) {
    if    (dd.ieee.exponent & i) { add_repr_char('E', &dr); }
    else                         { add_repr_char('.', &dr); }
  }


  for (unsigned long int i=1u<<19; i>=1 ; i>>=1) {
    if   (dd.ieee.mantissa0 & i) { add_repr_char('M', &dr); }
    else                         { add_repr_char('.', &dr); }
  }

  for (unsigned long int i = (1u<<31); i>= 1 ; i>>=1) {
    if   (dd.ieee.mantissa1 & i) { add_repr_char('M', &dr); }
    else                         { add_repr_char('.', &dr); }
  }
  add_char(0x00, &dr);

}

Github repository about-IEEE-754, path: /represent-ieee-754.c

The corresponding header file is

#define        IEEE_755_DOUBLE_REPRESENTATION_SIZE 104
typedef char   ieee_754_double_representation[IEEE_755_DOUBLE_REPRESENTATION_SIZE];
void represent_ieee_754_double(double d, ieee_754_double_representation r);

Github repository about-IEEE-754, path: /represent-ieee-754.h

print-some-doubles.c

print-some-doubles.c uses represent-ieee-754.c to print the binary representation of some doubes.

#include <math.h>
#include <float.h>
#include <stdio.h>

#include "represent-ieee-754.h"

void print_double_as_binary(double d) {

  ieee_754_double_representation r;
  represent_ieee_754_double(d, r);

  printf("%12.6f %s\n", d, r);

}

int main() { 


  print_double_as_binary(  0);
  print_double_as_binary( -0);
  print_double_as_binary(  1);
  print_double_as_binary(  0.5);
  print_double_as_binary(  1.0 + 0.5);
  print_double_as_binary(  1.0 + 0.5 + 0.25);
  print_double_as_binary(  1.0 + 0.5 + 0.25 + 0.125);
  print_double_as_binary(  1.0 + 0.5 + 0.25 + 0.125 + 0.125/2);
  print_double_as_binary(  2*(1.0 + 0.5 + 0.25 + 0.125 + 0.125/2));
  print_double_as_binary(  4*(1.0 + 0.5 + 0.25 + 0.125 + 0.125/2));
  print_double_as_binary(  1*(1.0 + 0.5 + 0.25 + 0.125 + 0.125/2) / pow(2, 1023));
  print_double_as_binary(  2);
  print_double_as_binary(  3);
  print_double_as_binary(  4);
  print_double_as_binary(  8);
  print_double_as_binary( 16);
  print_double_as_binary(- 1);
  print_double_as_binary( 10);
  print_double_as_binary( 1.5/pow(2, 1));
  print_double_as_binary( 1.5/pow(2, 2));
  print_double_as_binary( 1.5/pow(2, 3));
  print_double_as_binary( 250.25); // 11111010.01
  print_double_as_binary( DBL_MIN);
  print_double_as_binary( DBL_MIN * 2);
  print_double_as_binary(  NAN);
  print_double_as_binary(- NAN);
//print_double_as_binary( SNAN);
//print_double_as_binary(-SNAN);
  print_double_as_binary( INFINITY);
  print_double_as_binary(-INFINITY);
  print_double_as_binary( DBL_MAX);

  return 0;

}

Github repository about-IEEE-754, path: /print-some-doubles.c

The program prints

       +0.0000000000: ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +0.0000000000: ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +1.0000000000: ..EE  EEEE | EEEE  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +0.5000000000: ..EE  EEEE | EEE.  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +1.5000000000: ..EE  EEEE | EEEE  M... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +1.7500000000: ..EE  EEEE | EEEE  MM.. | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +1.8750000000: ..EE  EEEE | EEEE  MMM. | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +1.9375000000: ..EE  EEEE | EEEE  MMMM | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +3.8750000000: .E..  .... | ....  MMMM | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +7.7500000000: .E..  .... | ...E  MMMM | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +0.0000000000: ....  .... | ....  MMMM | M...  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +2.0000000000: .E..  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +3.0000000000: .E..  .... | ....  M... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +4.0000000000: .E..  .... | ...E  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +8.0000000000: .E..  .... | ..E.  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
      +16.0000000000: .E..  .... | ..EE  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       -1.0000000000: S.EE  EEEE | EEEE  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
      +10.0000000000: .E..  .... | ..E.  .M.. | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +0.7500000000: ..EE  EEEE | EEE.  M... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +0.3750000000: ..EE  EEEE | EE.E  M... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +0.1875000000: ..EE  EEEE | EE..  M... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
     +250.2500000000: .E..  .... | .EE.  MMMM | .M..  M... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +0.0000000000: ....  .... | ...E  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
       +0.0000000000: ....  .... | ..E.  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
                +nan: .EEE  EEEE | EEEE  M... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
                -nan: SEEE  EEEE | EEEE  M... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
                +inf: .EEE  EEEE | EEEE  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
                -inf: SEEE  EEEE | EEEE  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | ....  .... | 
      ....

Other formats

IEEE-854

Another standard for floating-point calculation is IEEE-854 (which allows for base 2 or 10 and does not specify how a floating-point number is encoded into bits).

8-bit formats used for deep learning/neural network

In 2022, two papers were published that introduced a format that allows to store a float in one byte aiming at improving the computational efficiency of machine learning and training of neural networks.

Although the float precision is much lower compared to traditional float formats, it was found that the training accuracy did not suffer too much.

These papers are

The first paper specifies

E4M3	1 bit for the sign, 4 bits for the exponents and 3 bits for the mantissa	Mostly used for weights
E5M2	1 bit for the sign, 3 bits for the exponents and 2 for the mantissa	Mostly used for the gradient

	`float`	`double`
Size	4 bytes	8 bytes
Mantissa bits	23 (24)	52 (53)
Decimal digitits of precision	7	15
Max number	3e38	1.7e308

IEEE-754 (Floating Point Arithmetic - float/double etc)