art with code

2010-12-19

WebGL Orrery


I wrote a small WebGL orrery a month back or so, using planet and orbit data from Wikipedia. The orbits came out a bit wrong (planet positions on the orbits are wrong, the planet rotation axis rotates along the orbit, planet velocities are constant, and the orbits / axes may well be flipped somewhere), but it's a passably "realistic" orrery. Well, realistic apart from having to scale the orbits down by a factor of a thousand to make the planets visible. With 1:1 orbits, the planets are smaller than a pixel in size when viewed from other planets and the sun is about a pixel in diameter when viewed from Neptune.

It's quite extensible too, it should be possible to add in all the moons just by filling in their data. The coordinate system and z-buffer aren't accurate enough to do anything fancy like a spaceship sim with 1:1 orbits though :)

2010-12-01

A deck of cards

If it weren't for the order property, you could cram a deck of cards into a 64-bit int. Though if you had an RNG cyclical in full_deck_size, that would preserve the order property, which would be enough for poker and other stackless or write-only-stack card games.

With the cyclical RNG you could pop a card off a full deck in O(1). With incomplete decks (that is, a deck with non-RNG-generated cards removed) you'd need to go through random cards until the generated card is in the deck. The worst-case time-complexity for popping all cards off an incomplete deck would be O(full_deck_size) and the average complexity for popping one card off an incomplete deck would be O(full_deck_size/incomplete_deck_size). Anyhow, incomplete decks would be as fast or slow as full decks, with an extra existence check slowing them down a bit further. The RNG would need a seed with log2(fac(full_deck_size)) bits, so log2(52!) = 226 bits for a deck without jokers.

A naive version of the RNG would be to generate a random number between 0 and 52!-1, then convert the number into 52 digits in factorial base. Divide first by 51!, the result is the first card. Divide the remainder by 50!, the result is the second card (note that you have to remove the first card from consideration for the second card index). Continue dividing the remainders until you have 52 cards. Or just generate a shuffled 52 element array of numbers and go through that :P
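
Here's a rough sketch of that naive decode in C (my own, untested illustration; it peels the factorial-base digits off the low end, which is equivalent to the divide-by-51!-first description, just consuming the digits in the other order). Note that a plain uint64_t only has room for decks of up to 20 cards, since 21! > 2^64, so the real 226-bit version would need a bignum in its place.

#include <inttypes.h>
#include <stdio.h>

/* Decode one big number into a permutation of deck_size cards. */
void decode_permutation(uint64_t n, int *cards, int deck_size)
{
  int available[64];
  int i, j;
  for (i = 0; i < deck_size; i++) available[i] = i;
  for (i = 0; i < deck_size; i++) {
    int remaining = deck_size - i;
    int idx = (int)(n % remaining); /* the factorial-base digit */
    n /= remaining;
    cards[i] = available[idx];
    /* remove the dealt card from consideration */
    for (j = idx; j < remaining - 1; j++) available[j] = available[j+1];
  }
}

int main()
{
  int cards[20], i;
  decode_permutation(1234567890123456789ULL, cards, 20);
  for (i = 0; i < 20; i++) printf("%d ", cards[i]);
  printf("\n");
  return 0;
}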

... I should get back to work now.

Source

#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>

typedef int64_t card_mask;
typedef int8_t card;

typedef struct deck {
  card cards[55];
  int8_t length;
  card_mask mask;
} deck_t;

#define suit_size 13

static const card_mask hearts = suit_size*0;
static const card_mask spades = suit_size*1;
static const card_mask clubs = suit_size*2;
static const card_mask diamonds = suit_size*3;
static const card_mask jokers = suit_size*4;

static const char* suits[5] = {"Hearts", "Spades", "Clubs", "Diamonds", "Jokers"};
static const char* ranks[13] = {
"Ace", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine", "Ten",
"Jack", "Queen", "King"
};

card_mask remove_cards_from_mask(card_mask deck, card_mask cards)
{
  return deck & ~cards;
}

card_mask add_cards_to_mask(card_mask deck, card_mask cards)
{
  return deck | cards;
}

card_mask query_mask(card_mask deck, card_mask cards)
{
  return deck & cards;
}

card_mask mask_of_card(card c)
{
  return (card_mask)1 << (card_mask)c;
}

card card_for_suit_and_rank(card_mask suit, card_mask rank)
{
  return suit + rank;
}

int deck_initialize(deck_t *deck, int jokers)
{
  card c = 0;
  if (jokers < 0 || jokers > 3)
    return 0;
  deck->length = 52 + jokers;
  deck->mask = ((card_mask)1 << (card_mask)deck->length)-1;
  for (c=0; c<deck->length; c++) {
    deck->cards[c] = c;
  }
  return 1;
}

void deck_clear(deck_t *deck)
{
  deck->length = 0;
  deck->mask = 0;
}

void deck_shuffle(deck_t *deck)
{
  int8_t i,j;
  card tmp;
  for (i=deck->length-1; i>0; i--) {
    j = rand() % (i+1);
    tmp = deck->cards[j];
    deck->cards[j] = deck->cards[i];
    deck->cards[i] = tmp;
  }
}

card_mask deck_query_mask(deck_t *deck, card_mask m)
{
  return query_mask(deck->mask, m);
}

card_mask deck_query_card(deck_t *deck, card c)
{
  return deck_query_mask(deck, mask_of_card(c));
}

card_mask deck_query_deck(deck_t *deck, deck_t *deck2)
{
  return deck_query_mask(deck, deck2->mask);
}

int deck_add(deck_t *deck, card c)
{
  card_mask m = mask_of_card(c);
  if (query_mask(deck->mask, m)) {
    return 0;
  } else {
    deck->mask = add_cards_to_mask(deck->mask, m);
    deck->cards[deck->length] = c;
    deck->length++;
    return 1;
  }
}

int deck_remove(deck_t *deck, card c)
{
  int8_t i=0, j=0;
  card_mask m = mask_of_card(c);
  if (!query_mask(deck->mask, m)) {
    return 0;
  } else {
    deck->mask = remove_cards_from_mask(deck->mask, m);
    for (i=0,j=0; i<deck->length; i++) {
      if (deck->cards[i] != c) {
        deck->cards[j] = deck->cards[i];
        j++;
      }
    }
    deck->length = j;
    return 1;
  }
}

card deck_pop(deck_t *deck)
{
  card c;
  if (deck->length < 1) {
    return -1;
  } else {
    deck->length--;
    c = deck->cards[deck->length];
    deck->mask = remove_cards_from_mask(deck->mask, mask_of_card(c));
    return c;
  }
}

const char* suit_string(card s)
{
  if (s >= suit_size*5)
    return "Invalid";
  return suits[s / suit_size];
}

const char* rank_string(card s)
{
  return ranks[s % suit_size];
}

void print_card(card c)
{
  printf("%s of %s\n", rank_string(c), suit_string(c));
}

void print_deck(deck_t *deck)
{
  int8_t i;
  printf("Deck %p:\n", (void*)deck);
  for (i=0; i<deck->length; i++) {
    print_card(deck->cards[i]);
  }
}

void print_mask(card_mask m)
{
  int i;
  const char* symbols[5] = {"♥", "♠", "♣", "♦", "☻"};
  for (i=0; i<64; i++) {
    if (i%suit_size == 0) printf(" %s", symbols[i/suit_size]);
    if ((m>>i) & 1) putchar('#');
    else putchar('.');
  }
  putchar('\n');
}

int main()
{
  int i=0;
  deck_t deck, hand;
  card c;
  srand(time(NULL));
  deck_initialize(&deck,2);
  deck_initialize(&hand,0);
  deck_clear(&hand);
  deck_shuffle(&deck);
  printf("Deck %p has %d cards and mask %ld.\n", (void*)&deck, deck.length, deck.mask);
  print_mask(deck.mask);
  for (c=0; c<54; c++) {
    if (!deck_query_card(&deck, c)) {
      printf("Error: Deck doesn't have a card.\n");
      print_card(c);
    }
  }
  printf("Hand %p has %d cards and mask %ld.\n", (void*)&hand, hand.length, hand.mask);
  print_mask(hand.mask);
  for (c=0; c<54; c++) {
    if (deck_query_card(&hand, c)) {
      printf("Error: Hand has a card it shouldn't have.\n");
      print_card(c);
    }
  }
  deck_remove(&deck, hearts);
  if (deck_query_card(&hand, hearts))
    { printf("Error: Hand has a card it shouldn't have.\n"); print_card(hearts); }
  deck_remove(&deck, spades);
  if (deck_query_card(&hand, spades))
    { printf("Error: Hand has a card it shouldn't have.\n"); print_card(spades); }
  deck_remove(&deck, clubs);
  if (deck_query_card(&hand, clubs))
    { printf("Error: Hand has a card it shouldn't have.\n"); print_card(clubs); }
  deck_remove(&deck, diamonds);
  if (deck_query_card(&hand, diamonds))
    { printf("Error: Hand has a card it shouldn't have.\n"); print_card(diamonds); }
  printf("Deck %p has %d cards and mask %ld.\n", (void*)&deck, deck.length, deck.mask);
  print_mask(deck.mask);
  for (i=0; i<5; i++) {
    c = deck_pop(&deck);
    print_card(c);
    deck_add(&hand, c);
  }
  printf("Deck %p has %d cards and mask %ld.\n", (void*)&deck, deck.length, deck.mask);
  print_mask(deck.mask);
  printf("Hand %p has %d cards and mask %ld.\n", (void*)&hand, hand.length, hand.mask);
  print_mask(hand.mask);
  print_deck(&hand);
  if (deck_query_deck(&deck, &hand) || deck_query_deck(&hand, &deck)) {
    printf("Error: Deck and Hand intersect.\n");
  } else {
    printf("Deck and Hand are disjoint.\n");
  }
  for (i=0; i<hand.length; i++) {
    if (deck_query_card(&deck, hand.cards[i])) {
      printf("ERROR ");
    } else {
      printf("OK ");
    }
  }
  printf("\n");
  for (i=0; i<deck.length; i++) {
    if (deck_query_card(&hand, deck.cards[i])) {
      printf("Error: Found a card in Deck that is in Hand.\n");
      print_card(deck.cards[i]);
    }
  }
  print_card(suit_size*5);
  return 0;
}

Output

$ ./a.out
Deck 0x7fff98907250 has 54 cards and mask 18014398509481983.
♥############# ♠############# ♣############# ♦############# ☻##..........
Hand 0x7fff98907210 has 0 cards and mask 0.
♥............. ♠............. ♣............. ♦............. ☻............
Deck 0x7fff98907250 has 50 cards and mask 18013848686551038.
♥.############ ♠.############ ♣.############ ♦.############ ☻##..........
Eight of Hearts
Seven of Clubs
Ten of Diamonds
Six of Diamonds
Nine of Hearts
Deck 0x7fff98907250 has 45 cards and mask 17714777228828286.
♥.######..#### ♠.############ ♣.#####.###### ♦.####.###.### ☻##..........
Hand 0x7fff98907210 has 5 cards and mask 299071457722752.
♥.......##.... ♠............. ♣......#...... ♦.....#...#... ☻............
Deck 0x7fff98907210:
Eight of Hearts
Seven of Clubs
Ten of Diamonds
Six of Diamonds
Nine of Hearts
Deck and Hand are disjoint.
OK OK OK OK OK
Ace of Invalid

2010-11-23

Yet more GREAT ideas

Amateur astronomer telescopes, live video streaming, combine the different observations into high-res frames: live 24/7 HD webcams of the planets.

Speckle imaging
Compressed sensing
Image Deblurring with Blurred/Noisy Image Pairs

More GREAT ideas

Take The European Citizens' Initiative, add a public forum, link it up with EU proceedings, build a Citizens' Parliament.

The ECI proposal says that you need a million signatories to send a citizens' initiative to the European Commission. The EC is the main proposal-generating entity in the EU. The European Parliament, in contrast, is the proposal-filtering entity (now with new proposal-generating powers granted to it by the Lisbon Treaty). In that sense, the ECI impacts the very top of the EU legislative system.

To be valid, the million signatories must include at least 0.2% of the voting-age citizens in at least a third of the member states. And after collecting 300 000 statements of support, you must submit the initiative to the EC for an admissibility check, where the EC figures out whether it can legally act on it or not. After two months, you should get back a yay/nay. And you can collect the statements of support online as long as your mechanism is secure (further details pending).

After that, 3 months of checking the signatures for validity by the member states using random sampling. After that, 4 months of examination by the EC, at the end of which it "would then be required to set out its conclusions on the initiative and the action it intends to take in a communication, which would be notified to the organiser as well as to the European Parliament and Council and would be made public."

Not the most blisteringly fast method of democratic participation, is it? Still, it's very interesting.

I wonder how much governance is a problem of thinking power (in terms of "need more heads to think better solutions!"), how much of it is a sensor network problem ("need to know what's wrong with the world to fix it!"), and how much a problem of integrating the two into a working system. How would this kind of wide-ranging, massively large parliament work best?

A large and diverse population has experts from all the different walks of life. Which is probably where the thinking power of the citizens' parliament would best manifest. But you need to find the best proposals from the noise of mediocre proposals. Then you have the sensor network side, where a large population is again an asset. But again, there's a large amount of noise with a large population. So perhaps the important thing is making a good network to filter the signal.

2010-10-22

Storage visions

Some sort of user-friendly UI to ZFS+Plan9. One big main filesystem that spans several drives and automatically does striping, mirroring and versioning. Fast drives are used as cache, big drives as mirrors and history.

Plug in a new internal drive and usable space, redundancy and total performance grow by some percentage of the new drive's specs. Bring a new computer into the local area network and the filesystems on the computers are merged into a network-wide filesystem, adding usable space, redundancy and performance.

Pull out a drive and the file system rebalances itself over the remaining drives. If some files require the pulled out drive, display notification and freeze the files until the drive is restored. Use a disk utility to separate a drive from the filesystem into a new filesystem, the utility copies files unique to the drive to other drives and disconnects and formats the separated drive when done.

Computers on the local network are used as page cache and as mirrors of the filesystem. Cloud service is used as a backup mirror and remote sync. Treat the entire memory hierarchy and network as parts of the same single filesystem. When on the network, it doesn't matter which computer or which disk has the file, apart from performance.

Transient devices carry a small local cache of the important things on the filesystem, and access the rest of the filesystem over the network. Either by connecting to the home network or by going through the cloud service. The local cache of the filesystem prioritizes metadata over the actual data and access to the actual data can usually be done by streaming a low-bandwidth version.

*mirrorshades*
*techno soundtrack*
*sunrise*

2010-10-20

How many dwarfs live in Mt. Everest

Dwarfs, those earth-dwelling little buggers, grand masters of volume. What if they burrowed the entire Mount Everest into a big dwarf-warren? Let's find out!

First, some assumptions.

Dwarfs are clearly good at cramped living as they live in mostly closed-off subterranean cities. They have also mastered the art of stonecraft to a magical level, so their buildings may well compare favorably to modern ferro-concrete towers. Let's assume that a dwarven city boasts a similar population density as urban Hong Kong, i.e. 250000 dwarfs per square kilometer with an average building height of 45 meters. A cubic kilometer of dwarvopolis would consist of 20 such layers and have a population of 5 million dwarfs.

Mount Everest is kinda pointy, but not all that pointy. Let's suppose that its slope is 45 degrees and that it approximates a cone. That would give it around 720 cubic kilometers in volume. But don't forget that dwarfs are subterranean, so we could reasonably assume that the dwarven city of Sagarmatha extends an extra kilometer under the surface with dwarven suburbia sprawling in a 10 kilometer radius. With the subterranean suburbs added, our dwarven city would have a total volume of around one thousand cubic kilometers.

From which a simple multiplication leads us to a total Sagarmatha dwarvopolitan population of 5 billion.

But why stop at Everest? Cast your net wider! The entire population of the Great Himalayan Dwarven Realm would be... the area of Himalayas is around 610000km^2, Tibetan plateau height is 4.5km, assume 0.5% of Everest population density for the sticks, i.e. 25000 dwarfs per cubic kilometer, and put it all together for a grand total of 69 billion Himalayan dwarfs.

Ubuntu 10.10 nits

Nice fonts.
Nice boot screen.
Nice boot screen doesn't work with fglrx.
Dark theme bg & window style screws up labels with dark text.
Dark title bar & dark menu bar occasionally look silly with the light-grey window bg. Especially Firefox looks bad. The thing with dark bg themes is that the amount of apps designed to work properly with one is damn close to zero. (Hence, I suppose, the light-grey window bg.)
The active window titlebar gradient looks kinda sloppy. Ditto for popup menu gradients.
Flat window buttons walk a tightrope between sucking and being just boring.
GTK widget theme buttons are very nice.
Dropdown widget has the triangle vertically off-center by 1px?
The salmon active color in the widget theme is not very nice.
Scrollbar widget looks different from the rest of the theme and a bit weird. Exacerbated by rounded buttons but no alpha channel, so Firefox textarea scrollbar buttons look icky.
New Rhythmbox icon is nice.
Rhythmbox itself is still nowhere near Amarok 1.4.
Amarok 2.0 is still nowhere near Rhythmbox.
Gnome Terminal icon is vertically off-center in top panel.
Notification area in top panel looks nice with the monochrome theme.
App icons in top panel are too tall and look bad.
Weather applet rainy icon has a black border. Other widget icons (other weather icons too) don't have borders, making the rainy icon look out-of-place.
The speech bubble icon in top panel is one pixel too low (more like one pixel too high).
The networking applet icon is super-confusing. I keep looking at it and wondering "Wtf was that icon for again? Check updates? No.. oh right, networking."
The (very low-visibility) round corners in panel dropdown menus look nice. But suffer from lack of text padding.
Window title color is actually orange instead of dark grey (try Chromium with GTK theme).
1px wide window resize handles make me cry.
The purple color works surprisingly well.
Default desktop bg is nice.
Popup text labels have too little horizontal padding. Otherwise they look nice.
Date & time applet text has too little horizontal padding as well.
The panel applets have crap spacing so you need to manually move them around by a couple pixels to make the layout look less horrible.
PulseAudio is glitchy and the volume control does not work (you can only set it between mute, medium volume and super-loud volume, no low volume setting available).
The new fancy volume control doubling as a music player remote is nice.
udev screws up md RAID arrays by being clueless (starts only one disk of a RAID-1 -> stops boot process thx to unable mount disk & dirty array after reassembly -> need resync -> FFFFUUUUUUU. How about using mdadm -A --scan like sane people do?) [edit: mdadm -A --scan wants to have an md device name like /dev/md0 instead of /dev/md/MyRAID, I dunno if that sunk udev or if it's just doing silly things.]
Nautilus is slow so I very much try not to manage files with it (instead terminal for quick & programmable things, Filezoo for overviews and visual sorting)
The CPU speed indicator applet could show all the CPUs at once instead of requiring running several copies of it.
I don't get what the Social Networks thing does. It doesn't seem to do anything. Maybe that's what it does.
What is Ubuntu One? Perhaps these things should be explained as one starts them up?
Why is this Gwibber thing so slow? Or perhaps it just lacks a progress indicator, making it feel slow.
Evolution feels like a really fiddly and slow interface to Gmail.

I do really like it for the most part, at least after a year in Debian's permabroken Gnome-land. I'd use FVWM or something cool and edgy and configurable like that for window management but compiz no worky on non-retarded WMs and theming FVWM (while infinitely easier than theming metacity) is limited and painful. Not that I've done any theming for a couple years. Plus you don't have a nice GUI to make F1 start a new terminal (just a nice thousand-line config file).

Maybe there should be a WebKit-based desktop environment.

2010-10-18

Fast SSD is fast

The PCI-E SSD I bought is a 120GB OCZ RevoDrive. Which is basically two 60 GB OCZ Vertex 2 SSDs mounted on a PCI-E fakeRAID controller.

Works nicely in Ubuntu 10.10, though apparently less nicely on earlier versions (problems booting from fakeRAID).

The amusing thing about the SSD is that the controller is compressing all the data it writes to the drive. So you get 540MB/s speeds for reading a file full of zeroes but just 300MB/s for reading a file full of MP3. Write speeds are 475MB/s for zeroes, 300MB/s for MP3s.



A random access 4kiB read takes around 0.26 ms, for a 16MB/s random read speed. But you can do around 8 random reads in parallel, which gets you a 117MB/s random read speed. My 7200 RPM disks can do around 75MB/s with a streaming read, so 117MB/s random access speed is absolutely nuts.

Parallel reads are kinda hard to program though. You could spawn a bunch of threads and do a gather into a shared array I suppose, though that feels a bit heavy (hah, right. I expect the threading overhead to be negligible.) Do the reads from a shared mmap of the file, each thread reading its own segments off it? [edit: no, mmap apparently serializes its page faults or OpenMP doesn't run the threads in parallel]
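
For what it's worth, here's a rough sketch of the spawn-threads-and-gather approach in C (my own, unbenchmarked; the file name, block offsets and fixed thread count are made up for the example):

#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

#define READS 8
#define BLOCK 4096

struct read_job { int fd; off_t offset; char buf[BLOCK]; };

static void *do_read(void *arg)
{
  struct read_job *job = (struct read_job*)arg;
  pread(job->fd, job->buf, BLOCK, job->offset); /* each thread reads its own block */
  return NULL;
}

int main(void)
{
  int fd = open("data.bin", O_RDONLY);
  pthread_t threads[READS];
  struct read_job jobs[READS];
  int i;
  srand(42);
  for (i = 0; i < READS; i++) {
    jobs[i].fd = fd;
    jobs[i].offset = (off_t)(rand() % 1024) * BLOCK; /* some random 4 kiB offsets */
    pthread_create(&threads[i], NULL, do_read, &jobs[i]);
  }
  for (i = 0; i < READS; i++)
    pthread_join(threads[i], NULL);
  close(fd);
  return 0;
}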

2010-10-10

WebGL presentation editor prototype


Try it out here, requires a WebGL capable browser. It's a bit buggy, so watch your head.

You can check out the sources on GitHub. What's still missing is making the editor work properly without WebGL (done now, though it's not Mobile Safari-friendly), making the slide editing more efficient, and fixing some markup parser bugs (some fixed). At that point it'd be a decent barebones slideshow app. Add themes, fancy filters and animations and stir on low heat.

2010-10-04

Desk drawer coding

Tally of different sorts of crap I've done to pass the time:

Music players written: 4 (with mplayer / <audio> / Flash for playback, so maybe these are more "playlist players").
Image slideshows written: 5, three of which on arbitrarily long lists.
Video players /w playlist: 1 (I just repurposed one slideshow).
Document readers: 3, though they just show images of pages.
Scene graph libraries for different retarded purposes: 7.
GUI toolkits: 1. But it didn't go too well.
LaTeX-style text justifier: 1.
Layout text inside a polygon: 1. And it seems like there's zero literature on text layout algorithms.
Tilemap zoomers: 3.
Infinite tree hierarchy zoomers: 1.
File managers: 1. And 3 web-based file managers.
File thumbnailer libraries: 2.
Thumbnail cache systems: 3.
OpenGL-based presentation apps: 3.
File annotation systems: 1.
Metadata editing forms: 3.
File metadata extraction libraries: 1.
Standard library rewrites: 1.
Array combinator libraries: 2. Well, maybe 3.
Search query parsers: 2, the second time Doing It Right with a parser generator.
Search query matchers: 1.
DOM helper libraries: 2.
Context-sensitive help in REPL: 1.
Distributed message-passing libraries: 2.
Network service discovery libraries: 1.
Media players with networked storage and playback (controller on mobile, storage across the network, playback on HTPC): 1, but durr.
Converter graph with pathfinder to convert files: 1 (IIRC plugged to the media player too, so it played PDFs with text-to-speech).
Animation libraries with tweens and timelines: 1.
SQL databases (tables and indices, not database engines): 5? Not very much.
Memcached to cache results of expensive queries: 1.
HTML fragment caches: 1.
Tar parsers: 1.
3D model parsers: 3 of varying levels of completeness.
Huffman encoder & decoder: 1 (yay, coursework).
Image file formats: One vector drawing hack, plus three that basically concatenate JPEGs into a bigger file.
Toy drawing programs: 3.
Games: 1.

Editors: 0.
Compilers: 0.
Desk calculators: 0.
Programming languages: 0 (unless you count an OO Brainfuck implementation).
Something useful: 0.

Hmm, that was more than I initially thought :<

2010-09-27

Visitor stats for September 2010

Browser market share


43% Firefox
31% Chrome
10% Safari
10% Explorer
4% Opera

OS market share


56% Windows
22% Linux
19% OS X
1.6% iOS
0.7% Android

More than 90% of visitors had a screen width of 1280 or higher.

94% had Flash, 1% had a Flash version other than 10. 85% had Java.

Most search engine visitors wanted to know about SSE matrix multiplication. Clearly a hot topic.

2010-09-25

Composing shaders

I haven't found a good way to do modular GLSL shaders, so here's some blather on that subject.

Shaders themselves are strings of source code, compiled to a binary by the OpenGL drivers. A shader has a void main() that is called to execute the shader. There are also global definitions in the form of attributes, varyings and uniforms. And a precision qualifier like precision highp float;, and C preprocessor macros.

A shader also has a type, it can either be a vertex shader or a fragment shader (or a geometry shader). The difference between the two is that a vertex shader determines what areas to consider for modification, whereas a fragment shader determines the output value for each element in the modified areas. Imagine you're drawing a filled circle: the vertex shader would output the bounding box of the circle and the fragment shader would test each element in the bbox to see if it lies inside the circle, setting them to the fill color if they do.
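
For example, a minimal (hypothetical) GLSL pair for that filled circle could look something like this: the vertex shader just passes the bounding quad's corners through, and the fragment shader does the inside-the-circle test.

// vertex shader
attribute vec2 vertex;   // corners of the circle's bounding quad
varying vec2 pos;
void main() {
  pos = vertex;
  gl_Position = vec4(vertex, 0.0, 1.0);
}

// fragment shader
precision highp float;
varying vec2 pos;        // interpolated position inside the quad
uniform float radius;
uniform vec4 fillColor;
void main() {
  if (length(pos) > radius) discard; // outside the circle, leave the pixel alone
  gl_FragColor = fillColor;
}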

But back to the composing part. If you have some common helper functions that you'd like to use in your different shaders (e.g. defaultTransform(), blend functions), how do you reuse them? I tried to link different shader objects together but it didn't work for whatever the reason. So now I'm just concatenating strings.

ShaderSource = Klass({
  initialize: function(type, source) {
    this.type = type;
    this.text = toArray(arguments).slice(1).map(function(s) {
      return s.text ? s.text : s;
    }).join("\n");
  }
});

Maybe you want to further split your shaders into functional segments and turn those segments on and off for different uses. One way is to put some #ifdefs in the shader and prepend a block of #defines to your source.

ShaderSource = Klass({
  initialize: function(type, source) {
    this.type = type;
    this.text = this.origText = toArray(arguments).slice(1).map(function(s) {
      return s.text ? s.text : s;
    }).join("\n");
  },

  setDefines : function(defines) {
    this.defines = defines;
    var defs = [];
    for (var i in defines) {
      defs.push("#define "+i+" "+defines[i]);
    }
    defs.push(this.origText);
    this.text = defs.join("\n");
  }
});
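
For example (a made-up usage sketch), you could compile the same fragment shader with and without a lighting block:

var fragmentSrc = new ShaderSource(gl.FRAGMENT_SHADER, [
  "precision highp float;",
  "varying vec4 color;",
  "void main() {",
  "  vec4 c = color;",
  "  #ifdef USE_LIGHTING",
  "  c.rgb *= 0.5;",
  "  #endif",
  "  gl_FragColor = c;",
  "}"
].join("\n"));
fragmentSrc.setDefines({USE_LIGHTING: 1}); // prepends "#define USE_LIGHTING 1" to the source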

Another way would be to affix an identifier to the source strings and only use a string if it's enabled. This is like writing a feature-poor preprocessor by yourself.

ShaderSource = Klass({
  initialize: function(type, source) {
    this.type = type;
    this.segments = toArray(arguments).slice(1);
    this.text = this.segments.map(function(s) {
      return s.text ? s.text : s;
    }).join("\n");
  },

  setEnabled : function(enabled) {
    this.enabled = enabled;
    this.text = this.segments.map(function(s) {
      if (!s.id || enabled[s.id])
        return s.text ? s.text : s;
      else
        return "";
    }).join("\n");
  },

  setDisabled : function(disabled) {
    this.disabled = disabled;
    this.text = this.segments.map(function(s) {
      if (!s.id || !disabled[s.id])
        return s.text ? s.text : s;
      else
        return "";
    }).join("\n");
  }
});

Or some mix of the above. I don't really know what's the super-best way to do this.

Easy inheritance in JavaScript

JavaScript inheritance is cumbersome. You have a constructor function and it has a prototype. And inheritance is done by setting the prototype to a newly created parent object? What? That means that you better not do anything in the constructor, otherwise you'll have a royal pain deriving new prototypes from it.

So I did what I do and wrote my own inheritance system, based on mixins. It still functions pretty much like the JavaScript system, but is more convenient for building the constructor and the prototype from existing classes.

Here's the infamous OO geometry demo:

Rectangle = Klass({
  initialize : function(w, h) {
    this.w = w;
    this.h = h;
  },

  getArea : function() {
    return this.w * this.h;
  }
})


Square = Klass(Rectangle, {
  initialize : function(s) {
    Rectangle.initialize.call(this, s, s);
  }
})


Moving = Klass({
  mass : 1,

  initialize : function(n) {
    this.s = new Array(n);
    this.a = new Array(n);
    this.v = new Array(n);
    this.f = new Array(n);
    for (var i=0; i<n; i++){
      this.s[i] = this.a[i] = this.v[i] = this.f[i] = 0;
    }
  },

  move : function(t) {
    for (var i=0; i<this.s.length; i++) {
      var a = this.a[i], s = this.s[i], v = this.v[i], f = this.f[i];
      var newA = f/this.mass;
      var newV = v + a*t + (1/2)*(newA-a)*t*t;
      var newS = s + v*t + (1/2)*a*t*t + (1/3)*(newA-a)*t*t*t;
      this.a[i] = newA;
      this.v[i] = newV;
      this.s[i] = newS;
    }
    this.previousMass = this.mass;
  }
})


Printable = {
  valueString : function() {
    var a = [];
    for (var i in this)
      if (typeof this[i] != 'function') a.push(i);
    a.sort();
    for (var i=0; i<a.length; i++)
      a[i] = a[i] + ": " + this[a[i]];
    return a.join(", ");
  },

  print : function() {
    console.log(this.valueString());
  }
}


MovingRectangle = Klass(Rectangle, Moving, Printable, {
  density : 0.19,
  height : 1,
  initialize : function(w,h,d) {
    Rectangle.initialize.call(this,w,h);
    Moving.initialize.call(this,2);
    if (d)
      this.density = d;
    this.mass = this.getVolume() * this.density;
  },

  getVolume : function() {
    return this.getArea() * this.height;
  }
})
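
Before looking at how Klass itself works, here's a small usage sketch (the numbers are made up): all the mixed-in methods end up on the same object.

var box = new MovingRectangle(2, 3, 0.5);
box.f[0] = 9.81 * box.mass; // apply a force along the first axis
box.move(0.1);
box.print();                // from the Printable mixin
console.log(box.getArea(), box.getVolume(), box.mass); // 6 6 3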

Klass is a function that builds a constructor. The returned constructor calls its prototype's initialize method.

The constructor's prototype is built by merging together all the arguments given in the Klass call. If an argument has a prototype property defined, the prototype is used for merging. Otherwise the argument is merged as itself.

The built prototype is then finally merged into the constructor function, so that you have access to the prototype functions and properties from the constructor object itself. Which is useful for doing calls to other classes, as you can do e.g. Parent.someMethod.call(this) instead of Parent.prototype.someMethod.call(this).

Klass = function() {
  var c = function() {
    this.initialize.apply(this, arguments);
  }
  c.ancestors = toArray(arguments);
  c.prototype = {};
  for(var i = 0; i<arguments.length; i++) {
    var a = arguments[i];
    if (a.prototype) {
      Object.extend(c.prototype, a.prototype);
    } else {
      Object.extend(c.prototype, a);
    }
  }
  Object.extend(c, c.prototype);
  return c;
}

Object.extend merges the src object's attributes with the dst object, ignoring errors. Ignoring the errors is useful in merging native objects (e.g. styles) as they've got some stuff that can't be overridden.

Object.extend = function(dst, src) {
  for (var i in src) {
    try{ dst[i] = src[i]; } catch(e) {}
  }
  return dst;
}

toArray creates a new array from an object that has a length property. Useful for dealing with all the silly Array-like objects such as the function arguments object and the lists returned by the DOM API.

toArray = function(obj) {
  var a = new Array(obj.length);
  for (var i=0; i<obj.length; i++)
    a[i] = obj[i];
  return a;
}

2010-09-22

WebGL slides translated

I translated my WebGL slides to English and put them on GitHub Pages. Which is great, btw. Upgraded my account to Micro just to give them props for that.

If you would like to buy an app to make fancy WebGL slideshows, send me a mail and I'll get back to you.



If you'd like to subscribe to a web-based zoomable file management & sync service instead, send me a mail and I'll get back to you.



In other news, I'm back on Twitter.

2010-09-16

Frontend Finland WebGL slides



I gave a talk at the Frontend Finland meetup about WebGL. Had a great time, thanks to the organizers! My presentation slides are available here in glorious Finnish. Best experienced with a browser that has a working WebGL implementation (fear not, there's a fallback to plain unadorned HTML). Writing the slideshow app (and the slides) took about three days, with four days of preliminary library work (and the app dev fed into the library too).

Here are some links on the topics covered:

The git repo for the utility library and the presentation can be found here.

The SVG presentation by Matti Paksula was really nifty as well. After the show, we were brainstorming about some kind of a super slideshow where you'd write the slides in HTML (with a nice CSS), then they'd get rendered using SVG / Canvas / WebGL depending on the capabilities of your browser. (What my presentation is doing is parsing the DOM into scene graph nodes, then rendering them using WebGL. Doing a similar system that rendered using SVG wouldn't be too difficult.)

Preserving the clickability of links and other such webness would be problematic with WebGL, but a mix of SVG, CSS 3D and WebGL might get you the best of both worlds: overlay SVG elems on top of the WebGL canvas, change the CSS 3D transform of the elems to match the WebGL scene graph. That'd be a lot like CAKE's mixed HTML/Canvas, except in 3D. If you want fancy FX on your SVG elems while preserving interactivity, you'd render them to textures and put the original element at opacity 0.01 on top of the fancy render, maybe? Or do shaders for SVG?

2010-09-08

Arithmetic

You have things, some things. Call these one-things. There is also a no-thing, call it 0. When you add one-thing to 0, we call that 1. Add a one-thing to 1 and you get 2. Add one-thing to 2 and you get 3. In that way we build a small list of numbers: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9.

When you add one-thing to 9, we don't have a number for that here. But let's call it a ten-thing and mark it on the left of the one-things. We have 1 ten-thing and 0 one-things, written in short as 10. Then we can add a one-thing to the 0 one-things to get 11. When we add a one-thing to 19, we get two ten-things: 20. When we have ten ten-things, we call that a hundred-thing and write it as 100: one hundred-thing, zero ten-things, zero one-things.

In a similar fashion, we could have a shorter list of numbers, say, 0, 1, 2, 3, 4. Now when you add one-thing to 4, you get one five-thing and zero one-things, which we can write as 10₅. When we have five five-things, we call that a twenty-five-thing and write it as 100₅.

There are also negative things. When a negative thing is added to a positive thing, they both disappear. Adding one negative thing to one positive thing makes both disappear, leaving you with zero things. If you add 5 negative things to 8 positive things, the 5 negative things and 5 positive things disappear, leaving 3 positive things. Adding 2 negative things to 3 more negative things gets you 5 negative things. Adding 4 negative things to 2 positive things makes the 2 positive things and 2 negative things disappear, leaving 2 negative things.

When you add 5 negative one-things to 2 positive ten-things, you need to crack open one of the ten-things to get ten positive one-things. The 5 negative things and 5 positive one-things disappear, leaving you with 1 positive ten-thing and 5 positive one-things: 15.

To add 5 negative one-things to 1 positive hundred-thing, you first crack open the hundred-thing to get ten ten-things, then crack open one of the ten-things to get ten one-things. The 5 negative one-things and 5 positive one-things disappear, leaving you with 9 positive ten-things and 5 positive one-things: 95.

Adding a zero to a number results in the number itself. Adding 5 to 0 results in 5. Adding 0 to 5 results in 5.

Negating a number turns a positive number into a negative number and a negative number into a positive number. Positive 2 negated is negative 2. Negative 2 negated is positive 2. Adding a number to its negation is zero. Positive 2 added to negative 2 is zero.

We write positive numbers with a + in front of them and negative numbers with a - in front of them. Positive 2 is +2. Negative 2 is -2. We write addition with a + between the numbers. The + symbol is called plus. Negative 3 plus positive 2 is -3 + +2. We write negate as -(number). Positive 2 negated is -(+2). Negative 2 negated is -(-2).

Two numbers are equal if they are the same. Equality is written with a =. 2 equals 2 is written 2 = 2 and amounts to true. If the two numbers are not the same, the equality operation returns false. If both sides of the equality operation are identical, the equality operation returns true. 2 = 2 is true. 2 = 3 is false. 1 + 1 = 2 is undecided, rewriting it as 2 = 2 or 1 + 1 = 1 + 1 makes it true.

Subtraction is an operation where the first number is added to the negation of the second number. Subtraction is written with a - between the numbers. The - symbol is called minus. -3 minus +2 is written -3 - +2 and means -3 + -(+2).

Multiplication is an operation where each of the things in the first number is replaced by the second number. Multiplying +3 by -2 replaces each of the 3 things with a -2, giving you +(-2) + +(-2) + +(-2) = -6. Multiplying +15 by +3 requires splitting open the ten-things to replace each of the one-things inside them with a +3, giving you +15 +3-things, which amount to +45 when added together. Multiplying -2 by -5 replaces the two things with -5-things, giving you -(-5) + -(-5), which simplifies to +5 + +5 and amounts to +10. Multiplying +0 by -2 gives you zero -2-things, which amounts to zero. Multiplying -2 by +0 replaces the two things with +0, giving you -(+0) + -(+0) = -0 + -0 = -0.

Fractional things are parts of one-things, much like how one-things are parts of ten-things. Fractional things are defined by how many parts are needed to make up a one-thing. When you need two parts for one-thing, the fractional thing is called a half. When you need three parts, the fractional thing is one-third. With five parts needed, it's a one-fifth. Five one-fifths added together amount to one. Similarly three one-thirds amount to one. Fractional numbers are written as 1/parts. One-fifth is written +1/5. One-third is written as +1/3. Negative one-sixth is written -1/6.

Adding 1/6 to 1/6 results in 2/6. Adding 1/6 to 5/6 results in 6/6, which equals 1. Adding 1/6 to 1 is 1/6 + 6/6 = 7/6. Adding 1/5 to 4/5 results in 5/5, which also equals 1. Adding 1/5 to 1 is 1/5 + 5/5 = 6/5.

Multiplication is written with an x between the numbers. The x is called times. 3 times 5 is written 3 x 5 and means 3 multiplied by 5, amounting to 15.

Multiplying a number by one results in the number itself. 5 x 1 is 5. 1 x 5 is 5.

The inverse of a number multiplied with the number itself results in one. The inverse of a number is the fraction with that many parts, i.e. 1/number. Zero does not have an inverse. The inverse of +5 is +1/5. The inverse of -3 is -1/3.

Three times the inverse of three is 3 x 1/3 = 1/3 + 1/3 + 1/3 = 3/3 = 1.

Negative two times the inverse of negative two is -2 x 1/-2 = -(-1/2) + -(-1/2) = 1/2 + 1/2 = 2/2 = 1.

Division is an operation where you multiply the first number with the inverse of the second number. 3 divided by 5 is 3 x 1/5 and amounts to 3/5. 8 divided by 2 is 8 x 1/2 and amounts to 8/2, which adds up to 4.

Division is written with a / between the numbers. 5 divided by -3 is written 5 / -3 and means 5 x -1/3, and amounts to -5/3, or -1 + -2/3.

The allowed moves when manipulating an equation with additions are:
  • Apply associativity rule to two additions: a + (b + c) = (a + b) + c
  • Apply commutativity rule to an addition: a + b = b + a
  • Add the same value to both sides of the equality: a = a <=> a + b = a + b

For an equation with multiplications, the above moves apply as well, but there is an additional move when dealing with a mix of additions and multiplications:
  • Apply distributivity rule: a x (b + c) = a x b + a x c

These are the only allowed structural moves for manipulating equations of additions and multiplications.

Additions can use the following equalities for simplifying expressions: a + 0 = a and a + -a = 0. Multiplications can use a x 1 = a and a x 1/a = 1.
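
Here's a concrete example of the moves in action (my own, in the same notation as the derivations below): solving 2 + x = 6 for x. The last step just adds 2 negative things to 6 positive things, as described earlier.

2 + x = 6 | a + b = b + a
x + 2 = 6 | a = a <=> a + b = a + b
x + 2 + -2 = 6 + -2 | a + -a = 0
x + 0 = 6 + -2 | a + 0 = a
x = 6 + -2
x = 4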

We can build more moves from the above, for example:

I'm playing fast and loose with the associativity in these,
a + b + c = (a + b) + c = a + (b + c),
I'm using whichever is the easiest at any given moment.

Also, multiplication binds tighter than addition:
a x b + c x d = (a x b) + (c x d)

x + b = c | a = a <=> a + b = a + b
x + b + -b = c + -b | a + -a = 0
x + 0 = c + -b | a + 0 = a
x = c + -b | a - b = a + -b
x = c - b
x + b = c <=> x = c - b

a x b = c | a = a <=> a x b = a x b
a x b x 1/b = c x 1/b | a x 1/a = 1 when a != 0
a x 1 = c x 1/b | a x 1 = a
a = c x 1/b | a / b = a x 1/b
a = c / b
a x b = c <=> a = c / b when b != 0

(a + b) x c = (a + b) x c | a x b = b x a
(a + b) x c = c x (a + b) | a x (b + c) = a x b + a x c
(a + b) x c = c x a + c x b | a x b = b x a
(a + b) x c = a x c + b x c

a x 1 = a x 1 | a x b = b x a
a x 1 = 1 x a

a x b = a x b | (a + b) x c = c x a + c x b
a x b = (a + -1) x b + (1 x b) | 1 x a = a
a x b = (a + -1) x b + b | a - b = a + -b
a x b = (a - 1) x b + b

-(a) = -(a) | a = b <=> a + b = a + b
a + -(a) = a + -(a) | a + -(a) = 0
a + -(a) = 0 | a = b <=> a + b = a + b
-a + a + -(a) = -a + 0 | a + -a = 0
0 + -(a) = -a + 0 | a + b = b + a and a + 0 = a
-(a) = -a

-1 x a = -1 x a | 0 + a = a
-1 x a = 0 + -1 x a | a - b = a + -b
-1 x a = 0 - 1 x a | 1 x a = a
-1 x a = 0 - a | a - b = a + -b
-1 x a = 0 + -a | 0 + a = a
-1 x a = -a

-1 x a = -a | -a = -(a)
-1 x a = -(a)

-a x b = -a x b | -1 x a = -a
-a x b = -1 x a x b | a x b = b x a
-a x b = a x -1 x b | -1 x a = -a
-a x b = a x -b

-a x b = -1 x a x b | -1 x a = -(a)
-a x b = -(a x b)

a x (b - c) = a x (b - c) | a - b = a + -b
a x (b - c) = a x (b + -c) | a x (b + c) = a x b + a x c
a x (b - c) = (a x b) + (a x -c) | a x b = b x a
a x (b - c) = (a x b) + (-c x a) | -a x b = -(a x b)
a x (b - c) = (a x b) + -(c x a) | a x b = b x a
a x (b - c) = (a x b) + -(a x c) | a - b = a + -b
a x (b - c) = (a x b) - (a x c)

a x 0 = a x 0 | 0 = -1 + 1
a x 0 = a x (-1 + 1) | a x (b + c) = a x b + a x c
a x 0 = a x -1 + a x 1 | a x 1 = a
a x 0 = a x -1 + a | -1 x a = -a and a x b = b x a
a x 0 = -a + a | -a + a = 0
a x 0 = 0

Now go forth and solve the problems of algebra!

Links for Tuesday

A Blender-WebGL exporter got forked.

Rrrola's FX. Mandelboxes create a lot of cool geometry, riding that line between geometric and organic. I wonder if you could build game levels quickly by CSG cutting mandelboxes.

Some motion graphics videos on YouTube.

2010-09-07

WebGL color spaces

There's an interesting discussion happening on the public-webgl mailing list about the color spaces used by the textures and the canvas.

As far as I can tell, the problem is:
  • Simple math works correctly only in a linear color space.
  • Images and the rest of web content is 8 bits per channel sRGB.
  • Converting between two 8-bit color spaces is lossy.
  • You can't tell what an image's color space is from JavaScript.
  • You can't easily convert between color spaces in JavaScript.


The proposed solutions are:
  • Pass image data as-is to WebGL, treat WebGL canvas values as sRGB. (I.e. the do-nothing solution. If application writers want correct math, they should use linear textures and do a final linear-to-sRGB pass on the rendered image.)
  • Optionally convert textures to linear color space, treat WebGL canvas as linear. (Problematic as there is severe information loss at the conversions. Math works correctly though.)

Color spaces? But I thought space was black?


Well, actually, space has no color. It's transparent. It just looks black because there's very little light coming through it at most points.

Color spaces, on the other hand, are definitions of what the numerical pixel values in an image mean. A linear RGB color space means something like "given an output device with three component emitters at wavelengths R=580 nm, G=490 nm and B=440 nm, the number of photons emitted by an emitter is Max_Output * V, where V = component_value / (2^component_bits-1)." For sRGB, the component values are transformed by the sRGB curve (roughly pow(V, 2.2)) to have more control over the important parts of the dynamic range. In particular, sRGB sacrifices precision in the light end to have more control in the darks. Converting linear->sRGB loses detail in the light end, whereas sRGB->linear loses detail in the dark end.

To project an image onto your eyeball, the image data goes through the following wringer: Image data in image color space -> Viewer app color space -> Window system color space -> Display device color space -> Hardware blinkenlichts -> Eyeball.

Now, most software goes "huh?" at the mention of that and acts as if the image data is in the window system color space and uses the data directly. Further, the assumption when doing image editing operations is that the aforementioned color space is linear. (Why that might cause problems is that while 0.25 + 0.25 has the same luminance as 0.5 in a linear color space, to add the luminances in gamma 2.2 you have to do (0.25^2.2 + 0.25^2.2)^(1/2.2) = 0.34.)
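
The same thing in JavaScript, using the rough gamma 2.2 curve instead of the exact sRGB transfer function:

var toLinear = function(v) { return Math.pow(v, 2.2); };
var toGamma = function(v) { return Math.pow(v, 1/2.2); };

toLinear(0.25) + toLinear(0.25);          // 0.0947..., the sum in linear light
toGamma(toLinear(0.25) + toLinear(0.25)); // 0.342..., back in gamma space
0.25 + 0.25;                              // 0.5, what naive math on the gamma-space values gives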

Software that requires accurate color handling usually converts the 8-bit values to 16 or 32-bit linear color space to get around the conversion errors and to get easy correct math internally. The resulting high-depth image is then converted down to window system color space for viewing.

The real solution is to use 64 bits per channel and specify the channels to mean photon count / second / mm^2. With that, you can crank from 1 photon / second all the way up to 6500x noon sunlight. And linear math. Then use that as the interchange format between everything. Heyo, massive framebuffers~

In the meantime, maybe the browser could have an iconv-like system for converting images from one color space to another. WebGL 1.0 doesn't have float textures, so upconverting to linear float textures has to wait for the float texture extension.

With linear float textures, you could upconvert all images to float textures, then use a linear float FBO for rendering, and finally convert the FBO to sRGB to display on the canvas element.

2010-09-06

Ramble on sorting

Merge sort merges already sorted regions together. It also has an operation to turn a small region into a sorted region. The merge sort splits its input into small regions, turns each of those into a sorted region, and finally merges them together.

There are some sorted regions that have a trivial merge operation: if the largest value in region X is smaller or equal to the smallest value in region Y, the merge operation is the concatenation X ++ Y. We can pre-process the input of the merge sort so that the two merged regions have this property. The pre-processing pass uses a pivot value and moves all values smaller than the pivot to the start of the region, leaving the end of the region for values larger than the pivot. Now you don't have to do anything to merge the regions together once they're sorted. This is the idea behind the quicksort algorithm.

Because pivoting can cause the regions to have different sizes, there is a worst-case scenario where the pivot is the only element in the other region. Because of this scenario, quicksort has O(n^2) worst-case time complexity. If you pick the pivot to be the median of the region with an O(n) median finding algorithm and further split the equal-to-pivot values equally between the subregions, you can get an O(n lg n) quicksort (but the O(n) median finding algorithm has a high overhead, so they say the resulting algorithm is slower than quicksort in the average case).

You can do a O(n) pre-processing pass over the input data to find all already sorted regions in the input data. If the number of found regions is small, you're dealing with nearly sorted data. You can directly merge all sorted regions that are larger than the region size for the region sorting operation. The merging should be done so that the smallest regions are merged together first. Merging regions of unequal size causes the worst-time complexity to increase (merging a 2-region and a X-region takes X+2 time, consider N/4 2-regions merged into a N/2-region: it takes N/4 * (N/2 + (2+N) / 2) time, which is in O(N^2).)

The pre-processing pass can also reverse regions that are sorted in the opposite direction. And it is probably possible to topologically sort the sorted region descriptors by their min-max-values to find regions that can be merged by concatenation. Another merge trick would be to use binary search on one region to find the span that the other region overlaps, then you could get away with memcpy(dst,smaller,smaller_sz); merge(dst+smaller_sz,overlap,region2,overlap_sz,region2_sz); memcpy(dst+smaller_sz+overlap_sz+region2_sz, larger, larger_sz).

If you keep track of how equally the quicksort pivot pass splits a region, you can avoid the quicksort worst-case by handing pathological splits over to merge sort. That is a bit nasty though, as the resulting regions no longer have the concatenative property and must be merged the slow way. Another alternative is to do something like introsort and switch over to heapsort once recursion depth exceeds lg n.

Once merge sort or quicksort reaches the maximum unsorted region size (say, 8 elements), implementations usually switch over to a low-overhead O(n^2) sorting algorithm like insertion sort.

The partitioning operation of merge sort is a no-op, but the partitioning operation in quicksort takes O(n) time. In the same fashion, the merge operation of quicksort is a no-op, but the merge operation of merge sort takes O(n) time.

You can do the merge sort merge in parallel by picking a pivot and splitting the sorted regions into two regions: one with values smaller or equal to the pivot and the other with values larger or equal to the pivot. The splitting operation consists of finding the index of the pivot in each region using a binary search. You then merge the small values with the small values and the large values with the large values. The merge between the merged smalls and larges is concatenation, so you can just allocate an array with the beginning reserved for the small values and the end reserved for the large values, and run the small and large merges in parallel. Recurse to increase parallelism. The nice thing about this approach is that you get segregated in-place parallelism, each of the sub-partitions can write to its own part of the root target array. The problem it suffers from is the quicksort partitioning problem: the pivot should be the median of the data set. Because the sorted regions are sorted, we can get the medians of each region easily enough, the worst case with one of those should be a 25:75-split when the regions are of equal size.

If your input data is 256 elements long and you split it into 8-element insertion-sorted regions, executing each in parallel (on a 32-core), what's the merge time? The first merge level you want to split into 2-way parallel merges (16 merges -> 32 merges). The second level is split 4-way (8 merges -> 32 merges), then 8-way and 16-way? The split bsearch overhead is 2 log2 mergeSize per level (sub-splits execute in parallel).

To do the 16-way split (assuming we get a perfectly balanced split), we first do a 2 log2 128 = 14 op split, then 2 log2 64 = 12 op split, then 10 op and 8 op, for total split cost of 44 ops. The merge then takes 16 ops to run (8+8-merge), for total merge cost of 60 ops. If we stop the split one level short, it'd take 36 ops to split and 32 ops to merge, 68 ops in total. So I guess you do want to split it all the way down if you have negligible overhead.

The total runtime for the zero-overhead parallel merge tree would be 44 + 30 + 18 + 8 + 4*16 = 164 ops. Total sort time with 64-op 8-elem sorts would be 228 ops. Hey, sub-O(n). Quadruple the input size and the core count and it's 8+18+30+44+60+78+6*16 + 64 = 398 ops. Hey, sub-linear parallel scaling.

There is another way to do parallel merging as well (cribbed from some lecture slides): for regions A and B, write A[i] to i + B.indexOfLarger(A[i]) and write B[i] to i + A.indexOfLarger(B[i]). N-parallelizable, serial O(N log N) time, parallel O(log N) time. However, it relies on scattered writes, so it doesn't seem very memory-access friendly (ideally each CPU core should be working on its own cache line, otherwise you get cache contention and traffic between the CPUs).
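
A small sketch of that scattered-write merge in C (my own; the strict / non-strict asymmetry in the two searches keeps equal elements from colliding). Each output position depends only on its own binary search, so the loop iterations could all run in parallel.

#include <stddef.h>

/* How many elements of sorted[0..n) should go before x. */
static size_t index_of_larger(const int *sorted, size_t n, int x, int strict)
{
  size_t lo = 0, hi = n;
  while (lo < hi) {
    size_t mid = lo + (hi - lo) / 2;
    int goes_before = strict ? (sorted[mid] < x) : (sorted[mid] <= x);
    if (goes_before) lo = mid + 1; else hi = mid;
  }
  return lo;
}

void merge_by_rank(const int *a, size_t na, const int *b, size_t nb, int *dst)
{
  size_t i, j;
  for (i = 0; i < na; i++)
    dst[i + index_of_larger(b, nb, a[i], 1)] = a[i]; /* count of b[j] < a[i] */
  for (j = 0; j < nb; j++)
    dst[j + index_of_larger(a, na, b[j], 0)] = b[j]; /* count of a[i] <= b[j] */
}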

Fun reading: Heapsort, Quicksort, and Entropy.

2010-09-04

Two's complement integers

Oh hey, I finally get how two's complement works after reading the first couple paragraphs of Why computers represent signed integers using two’s complement.

Simply put, adding a negative number to a positive number overflows if the result is positive, and doesn't overflow if the result is negative.

With 32-bit integers

5 - 1 = 5 + (-1) = 5 + (2^32-1) = 2^32+4 = 4
5 - 3 = 5 + (2^32-3) = 2^32+2 = 2
5 - 7 = 5 + (2^32-7) = 2^32-2 = -2
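
The same wraparound with 8-bit integers, to keep the numbers small: -1 is stored as 2^8-1 = 255, so 5 - 1 becomes 5 + 255 = 260, which overflows past 2^8 = 256 and wraps to 4; and 5 - 7 becomes 5 + 249 = 254, which doesn't overflow and is exactly the 8-bit representation of -2.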

This is kinda interesting, we can put the separator between positive and negative numbers where we like:

void print_custom_int (unsigned int n, unsigned int separator)
{
  if (n < separator) {
    printf("%u\n", n);
  } else {
    printf("-%u\n", ~n+1);
  }
}

print_custom_int(0, 200);
// 0
print_custom_int(1, 200);
// 1
print_custom_int(-1, 200);
// -1
print_custom_int(200, 200);
// -4294967096
print_custom_int(-4294967097, 200);
// 199

Or maybe you need extra positive range and only 256 negative values:

print_custom_int(4000000000, 0xFFFFFF00);
// 4000000000
print_custom_int(2*2004001006, 0xFFFFFF00);
// 4008002012
print_custom_int(-32-80+15, 0xFFFFFF00);
// -97

// Watch out for overflows!
print_custom_int(-16*17, 0xFFFFFF00);
// 4294967024

And yeah, division doesn't work out of the box.

unsigned int div_custom_int (unsigned int a, unsigned int b, unsigned int separator)
{
  if (a < separator && b < separator)
    return a / b;
  else if (a < separator)
    return ~(a / (~b+1))+1;
  else if (b < separator)
    return ~((~a+1) / b)+1;
  else
    return (~a+1) / (~b+1);
}

unsigned int n20 = -20, n3 = -3, p4 = 4, p2 = 2;

print_custom_int(n20 / p4, 0xFFFFFF00);
// 1073741819
print_custom_int(p4 / n3, 0xFFFFFF00);
// 0
print_custom_int(p4 / p2, 0xFFFFFF00);
// 2
print_custom_int(n20 / n3, 0xFFFFFF00);
// 0

// vs.

print_custom_int(div_custom_int(n20, p4, 0xFFFFFF00), 0xFFFFFF00);
// -5
print_custom_int(div_custom_int(p4, n3, 0xFFFFFF00), 0xFFFFFF00);
// -1
print_custom_int(div_custom_int(p4, p2, 0xFFFFFF00), 0xFFFFFF00);
// 2
print_custom_int(div_custom_int(n20, n3, 0xFFFFFF00), 0xFFFFFF00);
// 6

2010-08-31

Branching factor generator

A function to generate a stream of tree branching factors to approximate a non-integer branching factor.

branches' x i sum =
  let v = (if sum/i > x then floor x else ceiling x) :: Int in
  v : branches' x (i+1) (sum+fromIntegral v)

branches n = branches' n 0 0

e_branches = branches (exp 1)
e_100000 = take 100000 $ e_branches
average l = sum l / fromIntegral (length l)
average (map fromIntegral e_100000) == 2.71828

You could also write the average function as

average2 l = snd $ foldl (\(i, avg) e -> (i+1, (avg*(i-1) + e)/i)) (1, 0) l

average2 $ map fromIntegral e_100000

even though that is less accurate. You can use a similar trick for the branches' function to use a running average instead of a running sum.

branches2' x i avg =
  let v = (if avg > x then floor x else ceiling x) :: Int in
  v : branches2' x (i+1) ((avg*(i-1)+fromIntegral v)/i)

branches2 n = branches2' n 1 0
average $ map fromIntegral $ take 100000 $ branches2 (exp 1)

To generate a tree of wanted size out of the branching factor list, you'd have a tree generator that eats values off the list and generates tree nodes breadth-first. It is kind of a pain to write.
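
For reference, here's one way it could go (my own untested sketch, with a minimal made-up Tree type, and assuming the branching factor stream is infinite and positive like e_branches): first lay out the child count of every node in breadth-first order, then tie the knot lazily to turn the counts back into a tree.

data Tree = Node [Tree] deriving Show

-- Child counts for each of the `size` nodes in breadth-first order,
-- clamped so that exactly `size` nodes get created in total.
bfsCounts size bs = take size (go 1 bs)
  where go created (b:rest) = let c = min b (size - created)
                              in c : go (created + c) rest

splitPlaces [] _ = []
splitPlaces (n:ns) xs = let (h, t) = splitAt n xs in h : splitPlaces ns t

-- The i-th node's children are the next chunk of nodes after it in BFS order.
fromCounts counts = head nodes
  where nodes = map Node (splitPlaces counts (tail nodes))

buildTree size bs = fromCounts (bfsCounts size bs)
-- e.g. buildTree 10 e_branches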

And some Tuesday links

Man-Computer Symbiosis by J.C.R. Licklider. Continuing on the HCI vision golden oldies hitlist.

Twin hurricanes.

A Dust Mite.

2010-08-28

Folding, part 3: parallel folds

So, you can do a parallel fold if your folding function is associative.

import Control.Parallel (par, pseq) -- par and pseq live in the parallel package

-- Here i should be the neutral element of f, that is f i x = x.
-- E.g. 0 for + and 1 for *
parFold f i [] = i
parFold f i [x] = f i x
parFold f i xs =
  l `par` r `pseq` f l r
  where l = parFold f i left
        r = parFold f i right
        (left, right) = splitAt (length xs `div` 2) xs

-- See how associative functions work right with it
parFold (+) 0 [1..10] == foldl (+) 0 [1..10]
parFold (*) 1 [1..10] == foldl (*) 1 [1..10]

-- But others have problems
parFold (-) 0 [1..10] /= foldl (-) 0 [1..10]
parFold (/) 1 [1..10] /= foldl (/) 1 [1..10]

-- As - and / are defined as A + -B and A * 1/B,
-- we can work around these instances
parFold (+) 0 (map negate [1..10]) == foldl (-) 0 [1..10]
parFold (*) 1 (map recip [1..10]) == foldl (/) 1 [1..10]

But for the general case, we need something stronger. Roll in the reduceFold! It splits the foldable list into sublists, folds each sublist, and reduces the subresults into a single value.

reduceFold reduce f init [] = init
reduceFold reduce f init [x] = f init x
reduceFold reduce f init xs =
  l `par` r `pseq` reduce l r
  where l = reduceFold reduce f init left
        r = reduceFold reduce f init right
        (left, right) = splitAt (length xs `div` 2) xs

-- Associative operations work as usual
reduceFold (+) (+) 0 [1..10] == foldl (+) 0 [1..10]
reduceFold (*) (*) 1 [1..10] == foldl (*) 1 [1..10]

-- And we can now combine the subresults in a different fashion
reduceFold (+) (-) 0 [1..10] == foldl (-) 0 [1..10]
reduceFold (*) (/) 1 [1..10] == foldl (/) 1 [1..10]

Let's write some parallel list operations in terms of reduceFold.

parMap f lst = reduceFold (++) (\l i -> l ++ [f i]) [] lst
parFilter f lst = reduceFold (++) (\l i -> if f i then l++[i] else l) [] lst
parConcat lsts = reduceFold (++) (++) [] lsts
parConcatMap f lst = reduceFold (++) (\l i -> l++(f i)) [] lst

optLeft (Just x) _ = Just x
optLeft Nothing (Just x) = Just x
optLeft Nothing Nothing = Nothing

optFilter f i = if f i then Just i else Nothing

parFind f lst = reduceFold optLeft (\l i ->
  optLeft l (optFilter f i)) Nothing lst

optRight x y = optLeft y x

parFindLast f lst = reduceFold optRight (\l i ->
  optRight l (optFilter f i)) Nothing lst

-- Yeah, they pretty much work
divisibleBy n x = x `mod` n == 0

parMap (+ 10) [1..10] == [11..20]
parFilter (divisibleBy 3) [1..15] == [3,6,9,12,15]
parConcat [[1], [2,3,4], [5,6,7], [8,9], [10..20]] == [1..20]
parConcatMap (\x -> [x,x,x]) [1..3] == [1,1,1,2,2,2,3,3,3]
parFind (divisibleBy 127) [90,93..900] == Just 381
parFind (> 10) [1..10] == Nothing
parFindLast (divisibleBy 127) [90,93..900] == Just 762

You might've noticed that both parFold and reduceFold are creating a parallel evaluation spark for each element in the lists. This is not a very good idea as the sparking overhead is often going to dominate the execution time and kill performance.

So we need some way to control the reduce tree branching factor and clump up the leaf-level folds into longer operations. I don't know if I can manage to come up with an automagic or even semi-automagic way to do this. I would like to use yesterday's theoretical math as guidance for the tree shape optimizer. Maybe we'll find out in the next installment.
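One simple manual way to clump the leaf work (a sketch of mine, not a full solution to the automagic version): fold fixed-size chunks sequentially and only go through the spark machinery between chunks, so the chunk size sets the spark granularity.

chunksOf k [] = []
chunksOf k xs = take k xs : chunksOf k (drop k xs)  -- assumes k > 0

chunkedReduceFold chunkSize reduce f init xs =
  reduceFold reduce (foldl f) init (chunksOf chunkSize xs)

-- chunkedReduceFold 1000 (+) (+) 0 [1..100000] == foldl (+) 0 [1..100000]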

A small hill-climb optimizer


hillClimb f current delta error 0 = current
hillClimb f current delta error steps | delta < error = current
hillClimb f current delta error steps =
  if curVal > prevVal && curVal > nextVal
    then hillClimb f current (delta / 2) error (steps-1)
    else if prevVal > nextVal
      then hillClimb f (current-delta) delta error (steps-1)
      else hillClimb f (current+delta) delta error (steps-1)
  where curVal = f current
        prevVal = f (current-delta)
        nextVal = f (current+delta)

May have bugs in it, only tested on (\x -> -(x**2)).
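For instance, on that test function it walks to the maximum at zero (the arguments are f, starting point, initial step, tolerance and step budget):

hillClimb (\x -> -(x**2)) 5 1 1e-6 100
-- => 0.0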

2010-08-27

Folding, part 2: parallel reduction math

Parallel folding is a bit of an oxymoron. The fold operation sequentially builds up an accumulator value from the elements of a data structure. So if it's inherently sequential, how can it be parallelized?

If the reduce function given to fold is associative, the fold can be parallelized. If it's not associative, the fold is sequential.

Associativity means that you can turn (a + b) + c into a + (b + c), or, in function call terms, (f (f a b) c) == (f a (f b c)). When you have a long call chain like a + b + c + d + e + f + g + h, it means that you can split it into (a + b) + (c + d) + (e + f) + (g + h), evaluate the parenthesized bits in parallel, and then reduce the results to the final value. The reduction can also be parallelized, as it is the fold (a_b + c_d) + (e_f + g_h). This sort of parallel tree reduction is what map-reduce is all about btw: compute a bunch of values in parallel and reduce them to the result value.
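As a toy model of that reduction tree (a sketch of mine, not from the post), you can reduce adjacent pairs level by level; each pairwise level is what you'd evaluate in parallel:

pairwise f (x:y:rest) = f x y : pairwise f rest
pairwise f xs = xs

-- assumes a non-empty list
treeReduce f [x] = x
treeReduce f xs = treeReduce f (pairwise f xs)

-- treeReduce (+) [1..8] == foldl (+) 0 [1..8]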

Where it all goes by the wayside


Let's do the math to figure out the theoretical runtime for a reduction network. One parallel reduction step takes computeTime time. Moving the result to the next reduce node takes totalResultSize / bandwidth time. The variable totalResultSize is defined as resultSize * branchingFactor, as the branching factor is the number of nodes sending their reduce results to the reduce node. We can also split the transfer term into the more useful form branchingFactor * (resultSize / bandwidth) and write transferTime = resultSize / bandwidth. Finally, the number of parallel reduction steps required to reduce a tree of size N is log_branchingFactor N (and 1 < branchingFactor <= N).

Putting it all together, we get something like

treeReduceTime n branchingFactor computeTime transferTime =
  stepCount * stepTime
  where stepTime = computeTime + (branchingFactor * transferTime)
        stepCount = if branchingFactor == 1.0
                      then n
                      else log n / log branchingFactor

From the above we can come up with a couple of generalizations:

If computeTime is O(1) with regard to branchingFactor and you have infinite bandwidth or zero result size (both mean that transferTime is zero), you want a branching factor of N. That's easy enough to see:

With infinite bandwidth, the treeReduceTime is
(computeTime + (branchingFactor * 0)) * stepCount =
computeTime * stepCount

To minimize the above, we need to minimize stepCount.
The smallest stepCount we can have is 1.
The stepCount is 1 when branchingFactor is N.


If you have infinitely fast compute nodes (computeTime = 0), you want a branching factor of e. That's a bit tougher to prove[1] (or maybe I'm just lacking some essential bit of knowledge that would make the proof easier.)

Note that it's highly likely that computeTime depends on branchingFactor. If the reduce algorithm is O(n), computeTime = branchingFactor * elementComputeTime, and in that case the optimum branching factor is e (as elementComputeTime and transferTime have the same factor, they can be rolled together and the infinitely fast compute nodes proof applies). For O(n^2) algorithms, the optimal branching factor seems to be sqrt(e) (there the dominant term to minimize is b^2 / log b, whose derivative b*(2*log b - 1) / (log b)^2 is zero at log b = 1/2, i.e. b = sqrt(e)).

For O(1) computeTime, finite computational power and non-zero transfer time, the optimal branching factor is somewhere between e and N. I used a simple hill-climb optimizer to find optimal branching factors for different computation/transfer ratios, and they do seem to be independent of N (well, apart from the maximum branching factor). For 1:1 compute time : transfer time, the branching factor is around 3.6. For 1:10 (low bandwidth), the branching factor is a bit over 2.8. For 10:1 (slow compute), the branching factor is about 8.6. Here are some approximate ratios for integer branching factors: 3 -> 1:3.5, 4 -> 1.5:1, 5 -> 3:1, 6 -> 5:1, and for 1000:1 the factor is about 226.
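Those numbers come out of roughly this kind of search (a sketch using treeReduceTime above and the small hill-climb optimizer from the 2010-08-28 post; the starting point, step size and step budget are arbitrary guesses):

bestBranchingFactor n computeTime transferTime =
  hillClimb (\b -> negate (treeReduceTime n b computeTime transferTime)) 3 1 1e-6 200

-- bestBranchingFactor 1e6 1 1   =>  around 3.6
-- bestBranchingFactor 1e6 1 10  =>  a bit over 2.8
-- bestBranchingFactor 1e6 10 1  =>  around 8.6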

The above approximations can be used for branching-factor-dependent computation times too. You do that by replacing the computation time with the per-reduce-step overhead and folding computeTime into transferTime. The new ratio is overhead : (computeTime + transferTime). For intuition on this: if the overhead dominates, you want more branching, as that decreases the contribution of the overhead. If you have very little overhead, you want a branching factor close to e.

For an example, consider an email tournament that aims to pick the best email out of ten thousand. You receive some fixed amount of emails and your task is to pick the best one, then forward it to your assigned receiver (who does the same, etc., and the last reducer sends the mail to the award ceremony). Let's estimate that the transfer time is 5 seconds, the computing time is a minute per email, and the fixed overhead is 8 hours. This gives us an overhead:processing-ratio of 443. If each receiver processes 10 emails per session, running the tournament would take 33 hours. But if each receiver does 118 emails a go, the tournament would be done in 19.5 hours. And if they do 200 emails, it'd take ... 20.2 hours. (Yeah, 118 is the best branching factor here.)

Suppose you do the same tournament but with people glued to their mail apps. Now the overhead might be just one minute. This time, doing the tournament with 118 mails per session would take a snappy 4.1 hours. But as the role of overhead has fallen, we should reduce the branching factor as well. With 4 mails per session, the tournament would take only 35 minutes. (With 2 mails per session, 42 minutes. 4 is optimal.)
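The arithmetic behind those figures looks roughly like this (the function name and the hour conversion are mine; the 60 + 5 seconds is the per-mail processing plus transfer time from above):

tournamentHours nMails overheadSeconds mailsPerSession =
  stepCount * stepTime / 3600
  where stepTime  = overheadSeconds + mailsPerSession * (60 + 5)
        stepCount = log nMails / log mailsPerSession

-- tournamentHours 10000 (8*3600) 10   =>  about 33 hours
-- tournamentHours 10000 (8*3600) 118  =>  about 19.5 hours
-- tournamentHours 10000 60 118        =>  about 4.1 hours
-- tournamentHours 10000 60 4          =>  about 0.6 hours (35 minutes)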


[1] With infinitely fast compute nodes, the treeReduceTime is
(branchingFactor * transferTime) * stepCount

For a branching factor of 1, that equals
transferTime * n

For other branching factors, the equation is
b * t * log_b(n), where b = branchingFactor and t = transferTime

From which we see that t is a fixed factor,
so the minimum of the above equation is the minimum of the equation
b * log_b(n)

Which can be written as
log_b n = log n / log b, b > 1
b * log_b(n) =
b * (log n / log b) =
(b / log b) * log n

From which we see that log n is a fixed factor,
so the minimum of b * t * log_b(n) is the minimum of
b / log b

To find the minimum, find the derivative
D(b/log b) = (1*log b - b*1/b) / (log b)^2
= (log b - 1) / (log b)^2

The derivative is zero at b = e:
(log e - 1) / (log e)^2 = (1 - 1) / 1^2 = 0 / 1 = 0.
Checking the values of b / log b on both sides of this
critical point against the value at the critical point,
e / log e = e =~ 2.718
2 / log 2 =~ 2.885
3 / log 3 =~ 2.731,
confirms it as a local minimum.

As b approaches 1 from above, b / log b tends to infinity
(and it also grows without bound as b goes to infinity),
both larger than e / log e,
so the global minimum of b / log b for b > 1 lies at b = e.

Plugging that back into the original equation:
minimum of branchingFactor * transferTime * stepCount is
transferTime * min(n, e*ln(n))
and because n >= e*ln(n), the minimum is
transferTime * e * ln(n).
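A quick numeric sanity check with the treeReduceTime function above (computeTime 0, transferTime 1, n = 1e6):

map (\b -> (b, treeReduceTime 1e6 b 0 1)) [2, exp 1, 3, 4]
-- => roughly [(2.0, 39.9), (2.72, 37.6), (3.0, 37.7), (4.0, 39.9)],
--    with the minimum at b = e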

2010-08-26

Folding, part 1

The fold function is a data structure accumulator: it goes through every element in the data structure and builds up an accumulator value from them. An example for folding lists:

foldl f acc [] = acc
foldl f acc (x:xs) = foldl f (f acc x) xs

With a fold you can do maps and filters of different kinds. Here's a few examples:

-- id x = x
-- (to compile these, hide the clashing Prelude names first:
--  import Prelude hiding (reverse, map, filter, concat, concatMap, drop, take, any, all))
reverseMap f lst = foldl (\l i -> (f i):l) [] lst
reverse = reverseMap id
map f lst = reverse (reverseMap f lst)

filter f lst =
  reverse (foldl (\l i ->
    if f i
      then i:l
      else l
  ) [] lst)

concat a b = foldl (\l i -> i:l) b (reverse a)

concatMap f lst = foldl (\l i -> concat l (f i)) [] lst

triplicate lst = concatMap (\i -> [i,i,i]) lst

drop n lst =
  reverse $ snd $ foldl (\(c,l) i ->
    if c >= n
      then (c, i:l)
      else (c+1, l)) (0,[]) lst

You can also do find operations with fold, but they're not as efficient as you'd like, because the fold goes through all the elements in the list. To make find efficient, we need a fold with break.

findInefficient f lst =
  foldl (\e i ->
    case e of
      Nothing -> (if f i then Just i else e)
      Just x -> e) Nothing lst

data Break a = Break a | Continue a

foldlWithBreak f acc [] = acc
foldlWithBreak f acc (x:xs) =
  case f acc x of
    Break a -> a
    Continue a -> foldlWithBreak f a xs

find f lst =
  foldlWithBreak (\e i ->
    if f i
      then Break (Just i)
      else Continue Nothing) Nothing lst

take n lst =
  reverse $ snd $ foldlWithBreak (\(c, l) i ->
    if c >= n
      then Break (c,l)
      else Continue (c+1, i:l)) (0, []) lst

any f lst =
  -- maybe False (const True) (find f lst)
  case find f lst of
    Just x -> True
    Nothing -> False

all f lst =
  -- not $ any (not.f) lst
  not (any (\x -> not (f x)) lst)

And that's it for today. Tomorrow, parallel folding.

FormData file uploads

XMLHttpRequest Level 2 FormData is super. Loving it.

var fd = new FormData();
fd.append("key", "value");
var files = dropEvent.dataTransfer.files;
for (var i=0; i<files.length; i++)
fd.append("file", files[i]);

var xhr = new XMLHttpRequest();
xhr.open("POST", "/upload");
xhr.send(fd);

2010-08-25

sse.h update, HTML5 fiddling, more links

I updated sse.h with license info (MIT) and added software implementations of recip() and rsqrt() to double2 and double4. This Vector Classes library might be more useful for Real Serious Use, though.

Currently fiddling with HTML5 media tags (cursed be audio and video codecs. If it were possible to implement media codecs in JS, that would be a solution.) Also doing drag-and-drop file uploads. The event API is ... interesting. To receive a drop event, you need to preventDefault other drag-related events.

Failures are strengths.


Have some links to go:

Eruptions blog readership.
Activity at Sakurajima Volcano Intensifies.
Region-based memory management, ML Kit, A Simplified Account of Region Inference, Combining Region Inference and Garbage Collection, A Safe Runtime System in Cyclone.

2010-08-22

Random links for another day

PacketShader: a GPU-Accelerated Software Router - eeh?

Enceladus plumes with anigifs. Cassini just keeps taking these awesome shots of the Saturn system.

Kirkenes harbour (and googling with the name of the ship got me this. Take that for good measure as well.)

Fantoft stave church, Pre-Colombian Mexican architecture

... the deep-sea food web is fueled by a rain of dead plants and animals from surface waters. Sediment for the sediment god, let the red clay sink back into the mantle.

Cryosphere Today.

That reminds me: We inhabit a few meter thick layer at the solid-gas interface of a 6500 km radius sphere of rock and iron. Go up or down a few meters and you'll die. Go to the solid-liquid or gas-liquid interface and you'll die. Go to the solid, go to the liquid or go to the gas and you'll die.

Bismuth crystal. I quite like the Mineralogical Record print mag. Lots of good pictures.

2010-08-21

Random links of the day

Use of Ada in a Student CubeSat Project (via Ada-Europe 2008 via SPARK Conference Papers). First they built some arctic sea ice buoys and then used the same HW/SW-platform to do a CubeSat, in a high-integrity dialect of Ada. (The conference proceedings have some other wacky stuff too, like a presentation about porting a naval command system used in Royal Navy destroyers to Ada 2005. And yes, the slides are basically a long list of stuff that's changed between language versions, so it's not that exciting.)

Neptune about to complete first orbit since discovery. Discovered in 1846, orbital period 165 years, orbit complete next summer. Celebration? (Via Uncertain Principles.)

The dangers of plein air. That reminds me, I've fallen off the drawing routine. Perhaps I should unfall. Also this.

"According to the meteorologists, it's never summer in Gamvik". The coast of the Arctic Sea is a jolly good place. Full of fun and excitement and days that last for months.

2010-08-19

Random programming ideas

  • Programming is about gathering knowledge about a piece of code. What does it do, what are its limitations, how does it perform, where can it be used, what is it doing right now, how do I extend it, how much memory does it use, how do I know that it works right, etc.
  • Testing, logging, documentation and profiling as an integral part of a language. Parse each function and module into a {body, tests, docs, logstream, profilestream}-struct that can be fiddled with at runtime.
  • Automatically test a function based on its type. Call it with different args, log the results and summarize detected patterns to give programmer a quick description of what the function does and what its time-space-complexity looks like. Add a way to flag found properties as want/do not want and check that subsequent versions of the function fulfill those (basically autogenerating QuickCheck rules).
  • Typed UNIX pipes. Use HTTP headers to tell the recipient what's coming through.
  • Auto-generate type->type-pipelines by putting conversion functions in a graph, run graph search on it with edge weight given by amount of information lost in each conversion (and secondarily performance).
  • Semantic type information to encode the effects of a function. Two functions that purport to do the same thing may well have semantic differences: suppose two image scaling functions where one assumes linear color space and the other does gamma correction. Both do ((RGB w h), new_width, new_height) -> (RGB new_width new_height), but the results differ.
  • Concurrency based on HW-level message passing. Processors read and write in cache lines, cache lines can't be shared between two processors => use cache lines as messages.
  • Array programming-like parallel array computation lib. Element-wise operations on N-wide vectors are easy to parallelize and vectorize: divide N by the amount of processors and the SIMD width. Needs to compile array-level pipes ((f . g . h) A) into element-level pipes (map (f' . g' . h') A) to maximize arithmetic per memory access. Maybe Data Parallel Haskell does this stuff already.
  • Uniform batteries-included interface for different data structures. Yeah, doable with typeclasses. Yeah, something like OCaml Batteries Included Enums. You can reduce data structures to either empty, fold and unfold (for streaming data structures) or alloc, length, get and set (for random access structures). (A rough sketch of what that could look like follows after this list.)
  • Data structure -generic regular expressions (i.e. not tied to just strings), are they useful? See proof of concept. What you need is a fold over the data structure (and I guess that most data structures are foldable, how else would you free the memory their elements use?)
  • Instruction set with bounded & bordered LOADs and STOREs: LOAD reg addr minAddr maxAddr [borderValue]. Throw hardware exception if a bounded LOAD goes out of bounds, load borderValue if a bordered LOAD goes out of bounds. Bordered STORE writes to the address in borderValue instead. Faster than two compares and JMP? More compiler-friendly? Eliminates buffer overflows? (Random literature search: Security improvement in embedded systems via an efficient hardware bound checking architecture, Checking Array Bound Violation Using Segmentation Hardware, A Comprehensive Approach to Array Bounds Check Elimination for Java, x86 segmented access mode, some mainframe architectures are said to have something like that, mysteries of life.)
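Here's a rough sketch of the fold/unfold and alloc/length/get/set interfaces mentioned in the list above as typeclasses (my sketch, not from the post; all names are made up):

class Streamed f where
  empty :: f a
  fold :: (b -> a -> b) -> b -> f a -> b
  unfold :: (b -> Maybe (a, b)) -> b -> f a

instance Streamed [] where
  empty = []
  fold = foldl
  unfold f b = case f b of
    Nothing -> []
    Just (a, b') -> a : unfold f b'

class RandomAccess r where
  alloc :: Int -> a -> r a
  len :: r a -> Int
  get :: r a -> Int -> a
  set :: r a -> Int -> a -> r a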

2010-08-10

A slow cache algorithm and a faster one

It took around 30 ms to draw some text titles. The weird thing was that not drawing the titles didn't really affect the draw time. So clearly the problem was not in the drawing speed. Investigating, my text bitmap LRU cache was hitting its max size, destroying all possible benefits of caching. But making the cache large enough didn't fix the performance problem. A mystery!

On closer look, my caching algorithm was doing something silly:

function makeCache(maxSize) {
  return { array: [], hash: {}, maxSize: maxSize };
}

function cacheItem(cache, item) {
  // shift old items off the array
  while (cache.array.length >= cache.maxSize) {
    var it = cache.array.shift();
    delete cache.hash[it.key];
  }
  cache.array.push(item);
  cache.hash[item.key] = item;
}

// move item with key to the end of cache array
function refresh(cache, key) {
  var it = cache.hash[key];
  var idx = cache.array.indexOf(it);
  cache.array.splice(idx,1);
  cache.array.push(it);
}

function get(cache, key) {
  refresh(cache, key);
  return cache.hash[key];
}

Why that is bad: O(n) get, O(n) cacheItem after hitting maxSize. If you're going through the whole cache every frame, that's O(n^2). If you're also at maxSize, double that! Aargh!

So I rewrote it to be less insane and now the drawing is nice and fast:

function makeCache(maxSize) {
  return { array: [], hash: {}, maxSize: maxSize, lastUsedIndex: 0 };
}

function cacheItem(cache, item) {
  // stamp the item so items that are never refreshed still sort sanely
  item.lastUsed = cache.lastUsedIndex++;
  // chop off the oldest items after reaching overflow limit
  if (cache.array.length >= cache.maxSize * 2) {
    // sort newest-first so the splice drops the oldest entries
    cache.array.sort(function(a,b){ return b.lastUsed - a.lastUsed; });
    var deleted = cache.array.splice(cache.maxSize);
    for (var i=0; i<deleted.length; i++) {
      var it = deleted[i];
      delete cache.hash[it.key];
    }
  }
  cache.array.push(item);
  cache.hash[item.key] = item;
}

// update the cached item's lastUsed
function refresh(cache, key) {
  cache.hash[key].lastUsed = cache.lastUsedIndex++;
}

function get(cache, key) {
  refresh(cache, key);
  return cache.hash[key];
}

The gets are now O(1), and since the sort only happens once every maxSize insertions, cacheItem with a maxSize cache runs in amortized O((2n log 2n) / n) = O(log n) time per insert. You could further optimize it by using a split instead of a sort: run an O(n) selection algorithm to find the maxSizeth-newest element of the cache array, then use it as a pivot for a single quicksort partition pass. That'd give you amortized O(2n / n), i.e. O(1). :P
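For reference, here's a rough sketch of that select-and-partition idea (written in Haskell for brevity rather than JavaScript, and assuming the lastUsed stamps are unique, which they are since they come from a counter):

import Data.List (partition)

-- expected O(n) quickselect: the k-th smallest element, 0-indexed
kthSmallest k (x:xs)
  | k < l     = kthSmallest k smaller
  | k > l     = kthSmallest (k - l - 1) larger
  | otherwise = x
  where (smaller, larger) = partition (< x) xs
        l = length smaller

-- keep the `keep` newest stamps, evict the rest, without a full sort
-- (assumes 0 < keep < length stamps)
splitOldest keep stamps = partition (>= pivot) stamps
  where pivot = kthSmallest (length stamps - keep) stamps

-- splitOldest 2 [5,1,9,3] == ([5,9],[1,3])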

2010-07-27

Berlin!



Hanging out in Berlin this week, with a shiny new sketchbook to boot.

Went to the Pergamonmuseum on Sunday to be all Ozymandias. The things they have there are huge (indoor display of 15m high pillars, etc.) and pretty old (2-3 thousand years) and beautiful. And messed up in that ancient style of messed-upness. The objects in the Islamic museum side were nice: superior metalcraft and calligraphy (also a millennium or two younger than the rest of the stuff but who's counting). The Assyrian reliefs had writing going across them, like a textured strip of cuneiform. They had a special display on the coloring of the Greek statues, with brightly colored reproductions of much RGB might.
