Continuing from yesterday, landed a bunch of new parallel combinators in prelude.ml along with a smattering of convenience functions for working with arrays and strings.

The following version of the parallel mandelbrot drawer demonstrates byte strings:

open Prelude

(* Grayscale value for pixel (x, y) of a w-by-h Mandelbrot rendering:
   0 for points still bounded after [niter] iterations, otherwise
   255 minus the escape iteration, so earlier escapes render brighter.
   [niter] and [limit] are assumed to come from Prelude — confirm. *)
let mandelbrot w h x y =
  (* Map the pixel onto the complex plane. *)
  let cr = 2.0 *. float x /. float w -. 1.0
  and ci = 2.0 *. float y /. float h -. 1.5 in
  let rec iter n re im =
    if n >= niter then 0
    else
      let re' = re *. re -. im *. im +. cr in
      let im' = 2.0 *. re *. im +. ci in
      if re' *. re' +. im' *. im' > limit then 255 - n
      else iter (n + 1) re' im'
  in
  iter 0 0.0 0.0

(* pbinit f = psinit (fun i -> char (f i)) *)

(* Render a w-by-h tile of the fractal as a byte string, with the view
   shifted by (xoff, yoff) pixels.  [pbinit] fills the string in
   parallel; [quot_rem] recovers (row, col) from the flat index. *)
let draw_fractal xoff yoff w h =
  let pixel i =
    let y, x = quot_rem i w in
    mandelbrot w h (x + xoff) (y + yoff)
  in
  pbinit pixel (w * h)

(* blend the two images by averaging the values at each pixel *)

(* pbZipWith f = psZipWith (fun x y -> char (f (ord x) (ord y))) *)

(* Blend two byte-string images by averaging corresponding pixels.
   Bug fix: the original bound [a] and [b] but never applied them, so
   [blend a b] evaluated to an unapplied [pbZipWith average2] closure
   rather than the blended image. *)
let blend a b = pbZipWith average2 a b

(* Average a d-by-d rendering with a copy of itself shifted by
   (d/4, -d/4), producing a ghosted double exposure. *)
let blended_fractal d =
  blend (draw_fractal 0 0 d d) (draw_fractal (d/4) (-d/4) d d)

(* Complete binary PGM (P5) image of side d: header followed by the
   raw pixel bytes from [blended_fractal]. *)
let square_fractal d =
  sprintf "P5 %i %i 255\n" d d ^ blended_fractal d

And we can further refactor draw_fractal into the following:

(* Build a w*h byte string by calling [f x y] for every pixel in
   row-major order; [quot_rem] yields (row, col) from the flat index. *)
let pbinit2D f w h =
  let at i =
    let row, col = quot_rem i w in
    f col row
  in
  pbinit at (w * h)

(* Refactored drawer in terms of [pbinit2D].
   Fix: the refactor dropped the (xoff, yoff) offsets that the original
   [draw_fractal] took and that [blended_fractal] passes in its
   four-argument calls; restore them so callers still work. *)
let draw_fractal xoff yoff w h =
  pbinit2D (fun x y -> mandelbrot w h (x + xoff) (y + yoff)) w h

We could also write the drawer in terms of arrays instead of strings:

(* Array analogue of [pbinit2D]: build a w*h array from [f x y],
   row-major. *)
let painit2D f w h =
  let at i =
    let row, col = quot_rem i w in
    f col row
  in
  painit at (w * h)

(* Array-based drawer.
   Fix: keep the same (xoff, yoff, w, h) interface as the byte-string
   version — [blended_fractal] below calls this with four arguments,
   so the two-argument form broke its callers. *)
let draw_fractal xoff yoff w h =
  painit2D (fun x y -> mandelbrot w h (x + xoff) (y + yoff)) w h

(* Average two int-array images pixelwise; [chr @.. average2] converts
   each averaged int to a char so [implodeArray] can build a string.
   Bug fix: apply the zip to [a] and [b] — the original bound them but
   returned the unapplied [paZipWith ...] closure instead. *)
let blend a b = paZipWith (chr @.. average2) a b

(* Array version of the double-exposure blend; [implodeArray] turns the
   resulting char array back into a string. *)
let blended_fractal d =
  let base = draw_fractal 0 0 d d
  and shifted = draw_fractal (d/4) (-d/4) d d in
  implodeArray (blend base shifted)

But that's not a very good idea since ints are 4-8 times wider than chars, further lowering our ratio of computation to memory accesses.

One thing I found out in writing the blend was that it doesn't really get any noticeable speed-up from using the parallel zip vs. the normal zip. I suppose it's because the copying overhead is so large compared to the very trivial arithmetic: if the copy takes 4 cycles per byte and the iteration takes 3 cycles per byte, doing it with a single thread would take 3 cycles per byte, while doing it with two threads would take 1.5 + 4 = 5.5 cycles per byte (assuming a linear speed-up in the iteration).

By comparison, the mandelbrot function is very multicore-happy since it's self-contained and does no memory reads. As all it does is run a few hundred cycles worth of math in the CPU registers for every byte that it writes to memory, it scales in a very linear fashion.