The *SeqN functions are partially sequential, they synchronize after every N elements. To use the Mandelbrot example, if you're rendering a 64000 x 64000 image, it would take more than 4 gigs of RAM to do it with pbinit. For a sequential renderer that is no problem, since you can output each byte after it's computed. A parallel renderer can't really use that strategy, since the pixels are computed in parallel and outputting them as they come would lead to an unsorted image.

Sequential output:

# iter (fun i -> sleep (3-i); puts (showInt i)) (0--3);;

0

1

2

3

- : unit = ()

Parallel output:

# piter ~process_count:4 (fun i -> sleep (3-i); puts (showInt i)) (0--3);;

3

2

1

0

- : unit = ()

What we can do is turn the problem into a sequential iteration of internally parallel blocks. E.g. render a 1MB block of pixels in parallel, print it out before doing the next block. This way memory use is limited to block size and we still get a decent parallelization speedup.

Partially sequential iteration:

# iter (fun l -> iter puts @@ pmap (fun i -> sleep (3-i); showInt i) l) (groupsOf 2 (0--3));;

0

1

2

3

Or using piterSeqN:

# piterSeqN 2 puts (fun i -> sleep (3-i); showInt i) (0--3);;

0

1

2

3

Here's a larger example, using the already familiar Mandelbrot renderer. Starting with a sequential version:

open Prelude

let niter = 64

let limit = 4.0

let mandelbrot z x y =

let cr = 2.0 *. float x /. z -. 1.5 in

let ci = 2.0 *. float y /. z -. 1.0 in

let rec aux i zr zi = if i >= niter then 0 else

let nzr = zr *. zr -. zi *. zi +. cr

and nzi = 2.0 *. zr *. zi +. ci in

if nzr *. nzr +. nzi *. nzi > limit

then int (ceil (255. *. (float (niter-i) /. float niter)))

else aux (i+1) nzr nzi in

aux 0 0.0 0.0

(*vvv Interesting part starts here vvv*)

let draw_fractal z w h =

for y = 0 to h-1 do

for x = 0 to w-1 do

output_byte stdout (mandelbrot z x y)

done

done

let print_fractal w h =

puts (sprintf "P5 %i %i 255" w h);

draw_fractal (float (min w h)) w h

let () = print_fractal (parseInt Sys.argv.(1)) (parseInt Sys.argv.(1))

To make it parallel, we make draw_fractal create all scanlines and print them out:

let draw_scanline z w y =

binit (fun x -> mandelbrot z x y) w

let draw_fractal z w h =

let scanlines = pmap (fun y -> draw_scanline z w y) (0--(h-1)) in

iter putStr scanlines

And to make it constant-space parallel (well, O(width*256)), let's process in blocks of 256 scanlines:

let draw_fractal z w h =

piterSeqN 256 putStr (fun y -> draw_scanline z w y) (0--(h-1))

Done! Let's put it all together one more time:

open Prelude

let niter = 64

let limit = 4.0

let mandelbrot z x y =

let cr = 2.0 *. float x /. z -. 1.5 in

let ci = 2.0 *. float y /. z -. 1.0 in

let rec aux i zr zi = if i >= niter then 0 else

let nzr = zr *. zr -. zi *. zi +. cr

and nzi = 2.0 *. zr *. zi +. ci in

if nzr *. nzr +. nzi *. nzi > limit

then int (ceil (255. *. (float (niter-i) /. float niter)))

else aux (i+1) nzr nzi in

aux 0 0.0 0.0

let draw_scanline z w y =

binit (fun x -> mandelbrot z x y) w

let draw_fractal z w h =

piterSeqN 256 putStr (fun y -> draw_scanline z w y) (0--(h-1))

let print_fractal w h =

puts (sprintf "P5 %i %i 255" w h);

draw_fractal (float (min w h)) w h

let () = print_fractal (parseInt Sys.argv.(1)) (parseInt Sys.argv.(1))

We replaced five lines of sequential code with three lines of parallel code and got a version that scales nearly linearly with the number of cores. Not too bad, eh?