Dr. Dobb's | Varying Floating-Point Precision

Varying Floating-Point Precision

July 01, 1993
URL:http://drdobbs.com/varying-floating-point-precision/184402741

July 1993/Varying Floating-Point Precision/Listing 1

Listing 1 real_t.h — contains all the needed preprocessor gymnastics to make different precision versions of the same application

/* REAL_T.H */

#if !defined ( REAL_T_INCLUDED )
#define REAL_T_INCLUDED

/* Minimum numbers before sqrt()fails */
#define FLT_SQRT_MIN  1.0e-19F
#define DBL_SQRT_MIN  1.0e-154
#define LDBL_SQRT_MIN 1.0e-2466L

#if defined ( REAL_FLT )
   /* Type and constant macro for float */
   typedef float real_t;
   #define RC( x ) ( x##F )

   /* float.h constants */
   #define REAL_DIG        FLT_DIG
   #define REAL_EPSILON    FLT_EPSILON
   #define REAL_GUARD      FLT_GUARD
   #define REAL_MANT_DIG   FLT_MANT_DIG
   #define REAL_MAX        FLT_MAX
   #define REAL_MAX_10_EXP FLT_MAX_10_EXP
   #define REAL_MAX_EXP    FLT_MAX_EXP
   #define REAL_MIN        FLT_MIN
   #define REAL_MIN_10_EXP FLT_MIN_10_EXP
   #define REAL_MIN_EXP    FLT_MIN_EXP
   #define REAL_NORMALIZE  FLT_NORMALIZE
   #define REAL_RADIX      FLT_RADIX
   #define REAL_ROUNDS     FLT_ROUNDS
   #define REAL_SQRT_MIN   FLT_SQRT_MIN
   #if !defined ( REAL_MP )
   #undef DBL_DIG
   #undef DBL_EPSILON
   #undef DBL_MANT_DIG
   #undef DBL_MAX
   #undef DBL_MAX_10_EXP
   #undef DBL_MAX_EXP
   #undef DBL_MIN
   #undef DBL_MIN_10_EXP
   #undef DBL_MIN_EXP
   #undef DBL_RADIX
   #undef DBL_ROUNDS
   #undef DBL_SQRT_MIN
   #undef LDBL_DIG
   #undef LDBL_EPSILON
   #undef LDBL_MANT_DIG
   #undef LDBL_MAX
   #undef LDBL_MAX_10_EXP
   #under LDBL_MAX_EXP
   #undef LDBL_MIN
   #undef LDBL_MIN_10_EXP
   #undef LDBL_MIN_EXP
   #undef LDBL_RADIX
   #undef LDBL_ROUNDS
   #undef LDBL_SQRT_MIN
   #define DBL_DIG         REAL_DIG
   #define DBL_EPSILON     REAL_EPSILON
   #define DBL_MANT_DIG    REAL_MANT_DIG
   #define DBL_MAX         REAL_MAX
   #define DBL_MAX_10_EXP  REAL_MAX_10_EXP
   #define DBL_MAX_EXP     REAL_MAX_EXP
   #define DBL_MIN         REAL_MIN

   #define DBL_MIN_10_ EXP REAL_MIN_10_EXP
   #define DBL_MIN_EXP     REAL_MIN_EXP
   #define DBL_RADIX       REAL_RADIX
   #define DBL_ROUNDS      REAL_ROUNDS
   #define DBL_SQRT_MIN    REAL_SQRT_MIN
   #define LDBL_DIG        REAL_DIG
   #define LDBL_EPSILON    REAL_EPSILON
   #define LDBL_MANT_DIG   REAL_MANT_DIG
   #define LDBL_MAX        REAL_ MAX
   #define LDBL_MAX_10_EXP REAL_MAX_10_EXP
   #define LDBL_MAX_EXP    REAL_MAX_EXP
   #define LDBL_MIN        REAL_MIN
   #define LDBL_MIN_10_EXP REAL_MIN_10_EXP
   #define LDBL_MIN_EXP    REAL_MIN_EXP
   #define LDBL_RADIX      REAL_RADIX
   #define LDBL_ROUNDS     REAL_ROUNDS
   #define LDBL_SQRT_MIN   REAL_SQRT_MIN
   typedef long double long_double;
   #define long_double real_t
   #define double real_t
   #endif   /* #if !defined ( REAL_MP ) */

   /* math.h functions */
   #define acos( x ) ( (float)acos((double)(x) ) )
   #define asin( x ) ( (float)asin((double)(x) ) )
   #define atan( x ) ( (float)atan((double)(x) ) )
   #define atan2( y, x ) ( (float)atan2((double)(y), \
        (double)(x) ) )
   #define ceil( x ) ( (float)ceil( (double)(x) ) )
   #define cos( x ) ( (float)cos( (double)(x) ) )
   #define cosh( x ) ( (float)cosh( (double)(x) ) )
   #define exp( x ) ( (float)exp( (double)(x) ) )
   #define fabs( x ) ( (float)fabs( (double)(x) ) )
   #define floor( x ) ( (float)floor( (double)(x) ) )
   #define fmod( x, y ) ( (float)fmod( (double)(x), \
        (double)(y) ) )
   #define sin( x ) ( (float)sin( (double)(x) ) )
   #define sinh( x ) ( (float)sinh( (double)(x) ) )
   #define tan( x ) ( (float)tan( (double)(x) ) )
   #define tanh( x ) ( (float)tanh( (double)(x) ) )
   #if defined ( REAL_SAFE )
   #define atof( x )\
        ( ( (x) == NULL ) ? \
        RC( 0.0 ) : (float)atof( (x) ) )
   #define log( x ) \
        ( ( fabs( (x) ) ) < REAL_MIN ? \
        (float)log( (double)REAL_MIN) : \
        (float)log( (double)fabs( (x) ) ) )
   #define log10( x ) \
        ( ( fabs( x ) ) < REAL_MIN ? \
        (float)log10((double)REAL_MIN) : \
        (float)log10( (double)fabs( (x) ) ) )
   #define pow( x, y ) \
        ( fabs( (y) ) < REAL_MIN ? \
        RC ( 1.0 ) : \
        ( fabs( (x) ) < REAL MIN ? \
        ( (y)<RC(0.0) ? \
        REAL_MAX : RC( 0.0 ) ) : \
        (float)pow( (double)(x), (double)(y) ) ) )
   #define sqrt( x ) \
        ( fabs( (x) ) < REAL_SQRT_MIN ? \
        RC( 0.0 ) : (float)sqrt( double)fabs( (x) ) ) )
   #else
   #define atof( x ) ( (float)atof( (x) ) )
   #define log( x ) ( (float)log( (double)(x) ) )
   #define 1og10( x ) ( (float)log10( (double)(x) ) )
   #define pow( x, y ) ( (float)pow( (double)(x), \
        (double)(y) ) )
   #define sqrt( x ) ( (float)sqrt( (double)(x) ) )
   #endif   /* #if defined ( REAL_SAFE ) */

#else
#if defined ( REAL_LDBL )
   /* Type and constant macro for long double */
   typedef long double real_t;
   typedef long double long_double;
   #define RC( x ) ( x##L )

   /* float.h constants */
   #define REAL_DIG        LDBL_DIG
   #define REAL_EPSILON    LDBL_EPSILON
   #define REAL_MANT_DIG   LDBL_MANT_DIG
   #define REAL_MAX        LDBL_MAX
   #define REAL_MAX_10_EXP LDBL_MAX_10_EXP
   #define REAL_MAX_EXP    LDBL_MAX_EXP
   #define REAL_MIN        LDBL_MIN
   #define REAL_MIN_10_EXP LDBL_MIN_10_EXP
   #define REAL_MIN_EXP    LDBL_MIN_EXP
   #define REAL_RADIX      LDBL_RADIX
   #define REAL_ROUNDS     LDBL_ROUNDS
   #define REAL_SQRT_MIN   LDBL_SQRT_MIN
   #if !defined ( REAL_MP )
   #undef FLT_DIG
   #undef FLT_EPSILON
   #undef FLT_MANT_DIG
   #undef FLT_MAX
   #undef FLT_MAX_10_EXP
   #undef FLT_MAX_EXP
   #undef FLT_MIN
   #undef FLT_MIN_10_EXP
   #undef FLT_MIN_EXP
   #undef FLT_RADIX
   #undef FLT_ROUNDS
   #undef FLT_SQRT_MIN
   #undef DBL_DIG
   #undef DBL_EPSILON
   #undef DBL_MANT_DIG
   #undef DBL_MAX
   #undef DBL_MAX_10_EXP
   #undef DBL_MAX_EXP
   #undef DBL_MIN
   #undef DBL_MIN_10_EXP
   #undef DBL_MIN_EXP
   #undef DBL_RADIX
   #undef DBL_ROUNDS
   #undef DBL_SQRT_MIN
   #define DBL_DIG          REAL_DIG
   #define DBL_EPSILON      REAL_EPSILON
   #define DBL_MANT_DIG     REAL_MANT_DIG
   #define DBL_MAX          REAL_MAX
   #define DBL-MAX_10_EXP  REAL_MAX_10_EXP
   #define DBL_MAX_EXP     REAL_MAX_EXP
   #define DBL_MIN         REAL_MIN
   #define DBL_MIN_10_EXP  REAL_MIN_10_EXP
   #define DBL_MIN_EXP     REAL_MIN_EXP
   #define DBL_RADIX       REAL_RADIX
   #define DBL_ROUNDS      REAL_ROUNDS
   #define DBL_SQRT_MIN    REAL_SQRT_MIN
   #define FLT_DIG         REAL_DIG
   #define FLT_EPSILON     REAL_EPSILON
   #define FLT_MANT_DIG    REAL_MANT_DIG
   #define FLT_MAX         REAL_MAX
   #define FLT_MAX_10_EXP  REAL_MAX_10_EXP
   #define FLT_MAX_EXP     REAL_MAX_EXP
   #define FLT_MIN         REAL_MIN
   #define FLT_MIN_10_EXP  REAL_MIN_10_EXP
   #define FLT_MIN_EXP     REAL_MIN_EXP
   #define FLT_RADIX       REAL_RADIX
   #define FLT_ROUNDS      REAL_ROUNDS
   #define FLT_SQRT_MIN    REAL_SQRT_MIN
    #define float real_t
   #define double real_t
   #endif   /* #if !defined ( REAL_MP ) */

   /* math.h functions */
   #define acos acosl
   #define asin asinl

   #define atan atanl
   #define atan2 atan2l
   #define ceil ceill
   #define cos cosl
   #define cosh coshl
   #define exp expl
   #define fabs fabsl
   #define floor floorl
   #define fmod fmodl
   #define sin sinl
   #define sinh sinhl
   #define tan tanl
   #define tanh tanhl
   #if defined ( REAL_SAFE )
   #define atof( x ) \
        ( ((x)  == NULL ) ? \
        RC( 0.0 ) : (float)atofl( (x) ) )
   #define log( x ) \
        ( ( fabsl( (x) ) ) < REAL_MIN ? \
        logl( REAL_MIN ) : logl( fabsl( (x) ) ) )
   #define log10( x ) \
        ( ( fabsl( x ) ) < REAL_MIN ? \
        log10l( REAL_MIN ) : log10l( fabs( x ) ) )
   #define pow( x, y ) \
        ( fabsl( (y) ) < REAL_MIN ? \
        RC( 1.0 ) : \
        ( fabsl( (x) ) < REAL_MIN ? \
        ( (y) < RC( 0.0 ) ) : \
        REAL_MAX : RC( 0.0 ) ) : \
        (float)powl( (x), (y) ) ) )
   #define sqrt( x ) \
        ( fabsl( (x) ) < REAL_SQRT_MIN ? \
        RC( 0.0 ) : sqrtl( fabsl( (x) ) ) )
   #else
   #define atof atofl
   #define log logl
   #define log10 log101
   #define pow powl
   #define sqrt sqrtl
   #endif   /* #if defined ( REAL_SAFE ) */

#else    /* REAL_DBL is default */
   /* Type and constant macro for double */
   typedef double real_t;
   #define RC( x ) ( x )

   /* float.h constants */
   #define REAL_DIG        DBL_DIG
   #define REAL_EPSILON    DBL_EPSILON
   #define REAL_MANT_DIG   DBL_MANT_DIG
   #define REAL_MAX        DBL_MAX
   #define REAL_MAX_10_EXP DBL_MAX_10_EXP
   #define REAL_MAX_EXP    DBL_MAX_EXP
   #define REAL_MIN        DBL_MIN
   #define REAL_MIN_10_EXP DBL_MIN_10_EXP
   #define REAL_MIN_EXP    DBL_MIN_EXP
   #define REAL_RADIX      DBL_RADIX
   #define REAL_ROUNDS     DBL_ROUNDS
   #define REAL_SQRT_MIN   DBL_SQRT_MIN
   #if !defined ( REAL_MP )
   #undef FLT_DIG
   #undef FLT_EPSILON
   #undef FLT_MANT_DIG
   #undef FLT_MAX
   #undef FLT_MAX_10_EXP
   #undef FLT_MAX_EXP
   #undef FLT_MIN
   #undef FLT_MIN_10_EXP
   #undef FLT_MIN_EXP
   #undef FLT_RADIX
   #undef FLT_ROUNDS
   #undef FLT_SQRT_MIN
   #undef LDBL_DIG
   #undef LDBL_EPSILON
   #undef LDBL_MANT_DIG
   #undef LDBL_MAX
   #undef LDBL_MAX_10_EXP
   #undef LDBL_MAX_EXP
   #undef LDBL_MIN
   #undef LDBL_MIN_10_EXP
   #undef LDBL_MIN_EXP
   #undef LDBL_RADIX
   #undef LDBL_ROUNDS
   #undef LDBL_SQRT_MIN
   #define FLT_DIG         REAL_DIG
   #define FLT_EPSILON     REAL_EPSILON
   #define FLT_MANT_DIG    REAL_MANT_DIG
   #define FLT_MAX         REAL_MAX
   #define FLT_MAX_10_EXP  REAL_MAX_10_EXP
   #define FLT_MAX_EXP     REAL_MAX_EXP
   #define FLT_MIN         REAL_MIN
   #define FLT_MIN_10_EXP  REAL_MIN_10_EXP
   #define FLT_MIN_EXP     REAL_MIN_EXP
   #define FLT_RADIX       REAL_RADIX
   #define FLT_ROUNDS      REAL_ROUNDS
   #define LDBL_DIG        REAL_DIG
   #define LDBL_EPSILON    REAL_EPSILON
   #define LDBL_MANT_DIG   REAL_MANT_DIG
   #define LDBL_MAX        REAL_MAX
   #define LDBL_MAX_10_EXP REAL_MAX_10_EXP
   #define LDBL_MAX_EXP    REAL_MAX_EXP
   #define LDBL_MIN        REAL_MIN
   #define LDBL_MIN_10_EXP REAL_MIN_10_EXP

   #define LDBL_MIN_EXP    REAL_MIN_EXP
   #define LDBL_RADIX      REAL_RADIX
   #define LDBL_ROUNDS     REAL_ROUNDS
   typedef long double long_double;
   #define long_double real_t
   #define float real_t
   #endif   /* #if !defined ( REAL_MP ) */

   /* math.h functions */
   #if defined ( REAL_SAFE )
   #define atof( x ) \
        ( ( (x) == NULL ) ? \
        RC( 0.0 ) : (float)atof( (x) ) )
   #define log( x ) \
        ( ( fabs( (x) ) ) < REAL_MIN ? \
        log( REAL_MIN ) : log( fabs( (x) ) ) )
   #define log10( x ) \
        ( ( fabs( x ) ) < REAL_MIN ? \
        log10( REAL_MIN ) : log10( fabs( x ) ) )
   #define pow( x, y ) \
        ( fabs( (y) ) < REAL_MIN ? \
        RC( 1.0 ) : \
        ( fabs( (x) ) < REAL_MIN ? \
        ( (y) < RC( 0.0 ) ? \
        REAL_MAX : RC( 0.0 ) ) : \
        (float)pow( (x), (y) ) ) )
   #define sqrt( x ) \
        ( fabs( (x) ) < REAL_SQRT_MIN ? \
        RC( 0.0 ) : sqrtl( fabs( (x) ) ) )
   #endif   /* #if defined ( REAL_SAFE ) */

#endif   /* #if defined ( REAL_LDBL ) */
#endif   /* #if defined ( REAL_FLT ) */

/* kludge for long double */
#if defined ( REAL_NL ) && !defined ( REAL_MP )
/define long
#endif

#endif   /* #if defined ( REAL_T_INCLUDED ) */

/* End of File */

July 1993/Varying Floating-Point Precision/Listing 2

Listing 2 REAL_L2.C — demonstrate the effects of different precision. This program emphasizes simple operations and assignments.

/* REAL_L2.C */

#include <math.h>
#include <real_t.h>

void main( void )
   {

   int i;
   real_t x, error;

   x = RC( 1.0 );
   error = RC( 0.0 );
   for ( i = 0; i < 30000; i++ )
      {
      error += fabs( x - RC( 0.1 ) * x * RC( 10.0 ) );
      x++;
      }
   error = error;

   }   /* function main */

/* End of File */

July 1993/Varying Floating-Point Precision/Listing 3

Listing 3 REAL_L3.C — demonstrates the effects of different precision. This code uses the native type double long for speed.

/* REAL_L3.C */

#include <math.h>
#include <real_t.h>

void main( void )
   {

   int i;
   real_t x, error;

   x = RC( 1.0 );
   error = RC( 0.0 );
   for ( i = 0; i < 30000; i++ )
      {
      error += fabs( x - sqrt( x ) * sqrt( x ) );
      x++;
      }
   error = error;

   }  /* function main */

/* End of File */

July 1993/Varying Floating-Point Precision

Varying Floating-Point Precision

William Smith

William Smith is the engineering manager at Montana Software, a software development company specializing in custom applications for MS-DOS and Windows. You may contact him by mail at P.O. Box 663, Bozeman, MT 59771-0663.

Most C compilers support two different floating-point (real number) representations, the types loat and double. Some compilers, such as Microsoft C/C++ (MSC) and Borland C++ (BC), support a third type long double.

The choice of which floating-point type best matches an application's requirements for real number representation frequently confronts the programmer. Traditionally, the most popular choice and default floating-point type has been the type double. This type is not necessarily the best choice, however. An embedded application may be space limited, putting a premium on the small size offered by the loat type. Another application may require extremely high accuracy, putting a premium on the high precision sometimes offered by the long double type.

The three different real number types represent a broad choice. There are significant tradeoffs between them. The differences in size are obvious. On a PC, the loat type is four bytes, double is eight bytes, and long double is ten bytes. For situations where a program stores large amounts of data in memory as arrays or matrices, or on disk in binary files, the space savings of using float can be substantial.

Determining the effects on accuracy caused by the differences in precision between the three types is less obvious. Testing of an actual application helps expose differences in accuracy. Table 1 lists the sizes and precision of the floating-point types, in terms of significant decimal digits and a factor called epsilon. Epsilon is the smallest number that, when added to the value 1.0, creates a new number that is greater than 1.0.

High precision does not guarantee high accuracy or correctness. Accuracy is not the same as precision. A type has fixed precision, but each computed value can have different accuracy. Accuracy depends not only on precision but on what a program does with a particular value of a floating-point type.

Besides size and accuracy, the floating-point types also differ in how they affect the performance of a program. The effects on speed are even less obvious than accuracy, and usually require some testing.

The ideal way to measure the differences between the three types is to create different precision versions of the same program. With a small amount of effort, you can write your application, or modify an existing application, so the floating-point type is changeable at compile time. What better way to determine the effects on an application of using different floating-point types then by exercising different precision versions of the same program?

This article presents some techniques you can employ to change the choice of floating-point types at compile time. A single include file, real_t.h, shown in Listing 1, does all the needed preprocessor gymnastics to make different precision versions of the same application.

The techniques employed by real_t.h are limited to choosing among the standard predefined types. If you need a different type than the standard ones, my suggestion is to create new types using C+ +. With some effort, you can create anything from a two-byte real to a BCD (binary-coded decimal) math package. Creating a new type using C+ + affords complete control over the size and the implementation details of the math it employs.

Redefining Types

There are two approaches to creating different precision versions of the same program. The first is to use the preprocessor and redefine the standard types. To create a single precision version of an existing program that uses double, just define double to be float:

#define double float

The purist may consider this an abusive preprocessor tactic, but it works. For starters, you can use this method to convert a program from one type to another. The only limitation is that you do not have complete control if you desire to mix more than one type in the same module.

Another way is to use a generic floating-point type that is set at compile time:

typedef float real_t;

This method is much cleaner, but it requires you to use this generic type instead of the standard types. If you do use the generic type, it affords greater control and flexibility than just redefining the standard types. The most obvious advantage of the generic type is that you can mix types in a single module. Redefining all types to a common type prevents using mixed precision.

The include file real_t.h supports a combination of both approaches. It creates a generic type, real_t. It will also redefine float, double, and long double to real_t unless you specify that your code uses more than one type (mixed precision). For example, when setting the generic floating-point type to float, real_t.h does the following:

typedef float real_t;
#define double real_t;
typedef long double long_double;
#define long_double real_t;
#define long

Note the creation of the type long_double and the defining of long to nothing. This is a kludge to overcome the inability to define long double to real_t directly. Unfortunately C does not allow you to define a combination of tokens. You can not simply define long double to something different. You have to define long and double separately. Defining long to nothing has side effects with other non-floating-point types. I am open to suggestions of possible solutions to this problem with long double.

Effects on <math.h>

Redefining floating-point types has repercussions on the standard math function prototypes in <math.h>. Using a generic type also impacts these standard math library functions. Changing floating-point types generally requires macro wrappers for all the math functions in <math.h>. real_t.h, takes care of these problems by creating macros for all the floating-point functions. The macro wrappers ensure that a program using a math function calls the right function. In the case where the same function supports more than one precision, the macros in real_t.h cast the return value and parameters correctly.

In addition to ensuring that the proper precision version of a particular function is called, macros wrappers for all the functions in <math.h> yield an opportunity for embedding debugging code in an application. (See "Debugging with Macro Wrappers", CUJ, October 1992.) Pinpointing what line of source code caused a floating-point exception is easy by using the predefined macros __FILE__ and __LINE__, along with parameter checking of the standard math functions.

You can also make safe versions of the standard math functions by checking for out-of-range parameters. One fixup for such a parameter is to substitute a default result that is in range. This will hide the problem, of course. But you can also trap the error and report it, then use such a substitution to recover gracefully.

Effects on <float.h>

Just as the function prototypes in <math.h> needed some adjustment, the constants defined in <float.h> also need adjustment to account for different precision. real_t.h defines a new set of constants that begin with REAL_. You use these constants with the generic type real_t. real_t.h sets these constants to the appropriate values of FLT_, DBL_, or LDBL_, depending on the desired precision. real t.h also redefines all the FLT_, DBL_, and LDBL_ constants if you are not using mixed precision.

Manifest Constants

When dealing with different floating-point types, there are even caveats concerning constant data and how to declare constants, so you can modify the precision at which the compiler stores constants in a program.

To prevent conversion and possible loss of accuracy, the precision of manifest constants should match the precision of any floating-point variables that use them. Casting constants upon assignment may eliminate a compiler's warning that a conversion is taking place, but it does not ensure the program stored the constant in the program file the way you may desire. For example,

long double x = (long double)0.1;

is not the same as nor as accurate as:

long double x = 0.1L;

To ensure that a compiler stores constants in the same precision as the floating-point type, real_t.h uses a string-pasting macro. The version for storing manifest constants as floats is:

#define RC( x ) ( x##F )

And the version for storing manifest constants as long doubles is:

#define RC( x ) ( x##L )

The macro RC(x) provides a mechanism for adjusting the precision of constants as you change the type. real_t.h defines the macro RC to match the desired floating-point type. The only limitation to RC (actually just an inconvenience) is that you have to use the macro.

In a similar fashion, you may wish to define a format code for the printf family of functions. This format code would change based upon the precision of the generic type real_t.

Parameter Passing

A compiler may employ different mechanisms for passing and returning floating-point values from functions, possibly depending on compile-time options and data sizes. In the case of MSC, the compiler uses a global location called the floating point accumulator (_fac) for functions that return either floats or doubles. MSC uses the numeric data processor (NDP) stack for functions that return long double. BC uses the NDP stack regardless of the type of floating-point number. MSC further complicates matters by employing a hidden pointer parameter for functions that you declare with the _pascal calling convention instead of _cdecl.

Most of the time this behavior does not matter and is hidden from the programmer. About the only time this will become a concern is when you are creating a dynamic link library (DLL) for Microsoft Windows and are exporting functions from the DLL that return floating-point types. Since DLLs have a different data segment than the calling application, but share the same stack, there can be a bit of confusion. It can be hard to guess, for example, what segment the hidden pointers point to, or contains the floating point accumulator, or contains the NDP stack. The best way to avoid problems for Windows DLLs that export functions returning floating-point types is to use _pascal calling convention and the large memory model.

Using real_t.h

To use real_t.h with an existing program, just add the line:

#include "real_t.h"

You should include real_t.h after you list all the standard library headers that a module needs. If your program makes any calls to the standard library math functions declared in <math.h>, the program must include <math.h> before real_t.h. If your program uses any of the manifest constants defined in <float.h>, the program must include <float.h> before real_t.h.

real_t.h reacts to and modifies its behavior in response to six different constants. You can define these constants on the compiler command line or inside your program. Either way, you must define them before you program includes the file real_t.h. The first three are REAL_????, where ???? is either FLT, DBL, or LDBL. These three constants control what the default floating-point precision is. REAL_FLT is float, REAL_DBL is double, and REAL_LDBL is long double. If an application does not define one of these three constants and still includes real_t.h, REAL_DBL is assumed. This results in designating double as the default floating-point type real_t.

The constant REAL_MP determines if existing standard types get redefined. If you define REAL_MP, your program desires mixed precision. The existing standard types do not get redefined to the desired floating-point type. Defining REAL_MP also prevents the constants in float.h from getting redefined.

The constant REAL_NL controls whether the standard keyword long gets redefined. To convert the long double type to another type requires defining long to nothing. If you define REAL_NL and do not define REAL_MP, then long gets defined to nothing.

/* kludge for long double */
#if defined ( REAL_NL ) && !defined ( REAL_MP )
#define long
#endif

The constant REAL_SAFE causes real_t.h to wrap the log, log10, pow, and sqrt functions in safe parameter-checking macros. This demonstrates how to implement a safe version of some (not all) of the math functions.

Comparing Precisions

There are two test programs (REAL_L2.C in Listing 2 and REAL_L3.C in Listing 3) that demonstrate the effects of different precision. They give some "real world" indications of the effects that floating-point precision has on speed and accuracy. The size differences between the three different floating-point types are obvious, so the programs do not demonstrate differences in size.

Table 2 summarizes the results of running Listing 2 for float, double, and long double types. Table 3 lists similar results for running Listing 3.

I compiled the programs using MSC with the compiler options

/FPi87 /0d /DREAL_????

The first option forces the use of a math co-processor. The second prevents code optimization. The third defines either the constant REAL_FLT, REAL_DBL, or REAL_LDBL. This is how you adjust the precision at compile time. You will probably see different results using a different compiler or different compiler options, such as for a co-processor emulator version. Although interesting, analyzing all the possible combinations would soon balloon into a project larger than I can concisely summarize in a single article.

For both Listing 2 and Listing 3, the error value represents the accumulated error due to lack of accuracy. Ideally, the error should be 0.0. For simplicity and to facilitate timing, I obtained the value of error using a debugger instead of using printf.

The time is in seconds. Times are for testing on a 20MHz 386 with a math co-processor. Smaller numbers represent faster better performance.

Speed

The speed differences are interesting. The results of testing Listing 2 indicates the performance degrades as the precision increases. Listing 3 results shows the opposite — the performance increases as the precision increases. This may seem like conflicting data, but I came up with a explanation that makes sense. At least to me.

My interpretation of the results is that the larger the size the more overhead in basic assignment operations. Listing 2 emphasizes simple operations and asassignments. The float version is the fastest because floats are smaller, resulting in less physical memory being pushed around.

In the case of Listing 3, the long double version is the fastest. This is due to the fact that long double is the native type for the math co-processor. Using the native long double when calling the math functions eliminates the need to convert from one type to another. Type float is the slowest because the program must not only repeatedly convert from long double to double but also from double to float. Consequently, a program that makes a lot of use of the math functions in <math.h> may see some speed improvement with the long double type.

I base these conclusions solely upon the data generated by Listing 2 and Listing 3. To get a clear perspective on how a specific compiler treats different types would entail disassembling the compiled code.

Accuracy

Accuracy improves with precision. This is no surprise. In the case of Listing 2, the difference in magnitude between the error for float versus double is about the same as the difference between FLT_EPSILON and DBL_EPSILON. This means that FLT_EPSILON divided by DBL_EPSILON is about the same value as the error for the float version divided by the error for the double version.

The same is true for the long double type compared to the double or the float. The difference in magnitude is about the same as the difference in magnitude between LDBL_EPSILON and DBL_EPSILON or FLT_EPSILON. In the case of Listing 2, accuracy is directly related to precision.

This relationship does not hold true for Listing 3. Listing 3 tests the accuracy of one of the math functions, sqrt. I am tempted to conclude that the loss of accuracy in Listing 3 occurs mainly in the sqrt function and in any type conversion in calling the sqrt function. Hence, the differences between the error results are due to the differences between the two functions sqrt and sqrtl. Since there is not a float version of the sqrt function, both the float and the double version use the same function, sqrt. It is not surprising then that the corresponding versions generated the same amount of error.

The long double type has its own function sqrtl. It would be interesting, from the standpoint of both accuracy and speed, to see the effect of using the long version with the other precisions. In the case of Listing 3, accuracy did not improve when going from type float to the higher precision type double. It did improve by going to the long double type.

Conclusions

Real numbers in the world of computers are an oxymoron. The type double can represent roughly 264 different real numbers. These numbers are spread out between the limits — DBL_MIN and DBL_MAX. This is rather limited. By definition, there are actually an infinite number of different numbers between any two different real numbers you can name — even if they are very close together. Likewise, a binary representation of some real numbers, (0.1, 1/3 etc.) is truly just an approximation. So by definition, working with real numbers in the binary world is limited in accuracy.

Fortunately, imperfect accuracy is "good enough" most of the time. And for situations that require varying "goodness," there are three different floating-point types in C that one can use. Deciding which type is the best match for a specific application requires an assessment based upon the size, speed, and accuracy needs of the application.

The best way to determine what type is best is to test different precision versions of an application. The include file real_t.h provides the tools to easily switch floating-point types at compile time. It uses some aggressive and uncommon macro and preprocessor tactics, but it can even help existing applications change precision at compile time.

So instead of just using the default type double, try experimenting your code with the three different floating-point representations that most modern C and C++ compilers offer. There are usually advantages of one type over the others when addressing specific applications needs

July 1993/Varying Floating-Point Precision/Table 1

Table 1 Comparison of floating-point sizes and precisions (on a PC)

Type         Size (bytes)  Significant  Epsilon
                           Digits
------------------------------------------------------------------
float        4             7            1.192092896e-07
double       8             15           2.2204460492503131e-016
long double  10            19           5.4210108624275221706e-020

July 1993/Varying Floating-Point Precision/Table 2

Table 2 Comparison of accuracy performance for different versions of Listing 2

Type         Error         Time
-------------------------------
float        6.70556       1.16
double        2.49808e-008  1.21
long double  4.83773e-012  1.48

July 1993/Varying Floating-Point Precision/Table 3

Table 3 Comparison of accuracy and performance for different versions of Listing 3

Type         Error         Time
-------------------------------
float        3.85273e-008  3.85
double       3.85272e-008  3.46
long double  1.80687e-011  3.35