As we know, the core conceit of sparse files is that when you write to a file, and the start of the write is past the current file length, that space is filled with synthetic zero bytes, which (to the precision of some block size) aren't stored on disk. The definition in the excerpt below agrees with this. They are first present in V7 unix, in spite of having been described in this form since V1.
Per UNIX Programmer's Manual, K. Thompson & D. M. Ritchie, November 3, 1971, format of file system, p. 3,
If block b in a file exists, it is not necessary that all blocks less than b exist. A zero block number either in the address words of the i-node or in an indirect block indicates that the corresponding block has never been allocated. Such a missing block reads as if it contained all zero words.
and a picture is worth 861 words, so, filtered for relevant files:
:login: root
root
# chdir /tmp
# cat >big.s
start:
mov $1, r0
sys write; start; 10
mov $1, r0
sys seek; 4087.; 0
sys write; start; 10
sys exit
# ed big.s
105
/4087/
sys seek; 4087.; 0
s/4087/65527/
w large.s
106
q
# df
723+2364
# as big.s
I
II
# a.out >big
# as large.s
I
II
# a.out >large
# df
718+2364
# ls -l
total 22
124 sxrwrw 1 root 84 Jan 1 00:00:00 a.out
123 s-rwrw 1 root 4095 Jan 1 00:00:00 big
121 s-rwrw 1 root 105 Jan 1 00:00:00 big.s
126 l-rwrw 1 root 65535 Jan 1 00:00:00 large
125 s-rwrw 1 root 106 Jan 1 00:00:00 large.s
...
45 s-rwr- 1 root 142 Jan 1 00:00:00 utmp
i-node 41: USED DIR 0604 links=7 uid=0 size=70
41[ 4]: 44 tmp
i-node 44: USED DIR 0604 links=2 uid=0 size=200
44[16]: 126 large
44[17]: 125 large.s
44[18]: 123 big
44[19]: 121 big.s
i-node 126: USED REG LARGE 0606 links=1 uid=0 size=65535
i.dskp: 304 0 0 0 0 0 0 0
indir0: 303 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 305 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
i-node 125: USED REG 0606 links=1 uid=0 size=106
i.dskp: 301 0 0 0 0 0 0 0
i-node 123: USED REG 0606 links=1 uid=0 size=4095
i.dskp: 300 0 0 0 0 0 0 302
i-node 121: USED REG 0606 links=1 uid=0 size=105
i.dskp: 297 0 0 0 0 0 0 0
# df
718+2364
# cat big
@ @@ @# df
712+2364
# cat large
@ @@ @# df
586+2364
i-node 41: USED DIR 0604 links=7 uid=0 size=70
41[ 4]: 44 tmp
i-node 44: USED DIR 0604 links=2 uid=0 size=200
44[16]: 126 large
44[17]: 125 large.s
44[18]: 123 big
44[19]: 121 big.s
i-node 126: USED REG LARGE 0606 links=1 uid=0 size=65535
i_dskp: 304 0 0 0 0 0 0 0
indir0: 303 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 305 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
i-node 125: USED REG 0606 links=1 uid=0 size=106
i_dskp: 301 0 0 0 0 0 0 0
i-node 123: USED REG 0606 links=1 uid=0 size=4095
i_dskp: 300 306 307 308 309 310 311 302
i-node 121: USED REG 0606 links=1 uid=0 size=105
i_dskp: 297 0 0 0 0 0 0 0
# df
/dev/rk0 542
# chdir /tmp
# cat >big.s
start:
mov $1, r0
sys write; start; 10
mov $1, r0
sys seek; 4087.; 0
sys write; start; 10
sys exit
# ed big.s
105
/4087/
sys seek; 4087.; 0
s/4087/65527/
w large.s
106
# df
/dev/rk0 540
# as big.s
# a.out >big
# as large.s
# a.out >large
# ls -l
total 141
-rwxrwxrwx 1 root 84 Jun 12 19:59 a.out
-rw-rw-rw- 1 root 4095 Jun 12 19:59 big
-rw-rw-rw- 1 root 105 Jun 12 19:59 big.s
-rw-rw-rw- 1 root 65535 Jun 12 19:59 large
-rw-rw-rw- 1 root 106 Jun 12 19:59 large.s
-rw-r--r-- 1 root 144 Jun 12 19:50 utmp
# df
/dev/rk0 534
# cat big large
@ @ @ @ #
# df
/dev/rk0 402
Q.E.D., with same wording in UNIX Programmer's Manual, Fourth Edition, K. Thompson & D. M. Ritchie, November, 1973, file system (V).
# df
/dev/rk0 444
# chdir /tmp
# cat >big.s
start:
mov $1, r0
sys write; start; 10
mov $1, r0
sys seek; 4087.; 0
sys write; start; 10
sys exit
# ed big.s
105
/4087/
sys seek; 4087.; 0
s/4087/65527/
w large.s
106
# df
/dev/rk0 442
# as big.s
# a.out >big
# as large.s
# a.out >large
# ls -l
total 140
-rwxrwxrwx 1 root 84 Mar 21 12:10 a.out
-rw-rw-rw- 1 root 4095 Mar 21 12:10 big
-rw-rw-rw- 1 root 105 Mar 21 12:09 big.s
-rw-rw-rw- 1 root 65535 Mar 21 12:10 large
-rw-rw-rw- 1 root 106 Mar 21 12:09 large.s
# df
/dev/rk0 436
# cat big large
@ @ @ @ #
# df
/dev/rk0 304
Q.E.D., with unchanged wording on p. 2 of UNIX Programmer's Manual, Fifth Edition, K. Thompson & D. M. Ritchie, June, 1974, file system (V).
# df
/dev/rk0 958
/dev/rk1 937
/dev/rk2 bad free count
192
# cat >big.s
start:
mov $1, r0
sys write; start; 10
mov $1, r0
sys seek; 4087.; 0
sys write; start; 10
sys exit
# ed big.s
105
/4087/
sys seek; 4087.; 0
s/4087/65527/
w large.s
106
# df
/dev/rk0 956
/dev/rk1 937
# as big.s
# a.out >big
# as large.s
# a.out >large
# df
/dev/rk0 950
/dev/rk1 937
# ls -l
total 142
-rwxrwxrwx 1 root 84 Oct 10 13:37 a.out
-rw-rw-rw- 1 root 4095 Oct 10 13:37 big
-rw-rw-rw- 1 root 105 Oct 10 13:36 big.s
-rw-rw-rw- 1 root 65535 Oct 10 13:37 large
-rw-rw-rw- 1 root 106 Oct 10 13:36 large.s
# cat big large
@ @ @ @ #
# df
/dev/rk0 818
/dev/rk1 937
Q.E.D., untouched in UNIX Programmer's Manual, Sixth Edition, K. Thompson & D. M. Ritchie, May, 1975, file system (V).
# chdir /tmp
# cat >big.c
main() {
long off = 4087;
write(1, main, 10);
lseek(1, off, 0);
write(1, main, 10);
}
# ed big.c
73
/4087/
lseek(1, 0, 4087);
s/4087/65527/
w large.c
74
# df
/dev/rp0 984
/dev/rp3 297416
# cc big.c
# a.out >big
# cc large.c
# a.out >large
# df
/dev/rp0 975
/dev/rp3 297416
# ls -l
total 14
-rwxrwxr-x 1 root 584 Dec 31 19:06 a.out
-rw-rw-r-- 1 root 4097 Dec 31 19:06 big
-rw-rw-r-- 1 root 73 Dec 31 19:05 big.c
-rw-rw-r-- 1 root 10 Dec 31 19:06 large
-rw-rw-r-- 1 root 74 Dec 31 19:05 large.c
# cat big large
w <$uww <$uww >%uw >%u#
# df
/dev/rp0 975
/dev/rp3 297416
which for the first time correctly behaves according to the still-intact paragraph in UNIX Programmer's Manual, Sixth Edition, K. Thompson & D. M. Ritchie, May, 1975, file system (V), with the corrected behaviour corresponding to readi() zeroing an unused I/O buffer if the address of the data block is 0.
I noticed this when writing a parser for the V1 filesystem, because I forgot about this case. And then I noticed that when writing the parser for V4 filesystems I based it on, I'd also forgotten about this case (the integrity of the v4root.tar dump is unaffected: there are no sparse files on the installation tape rootfs). And then I realised I didn't think I'd seen these branches in V1 mget when reading it for the previous post, either.
unix file system files contain 8 slots for links to 512-byte disk blocks. If the file fits within that size, i.e. is no larger than 4KiB, it is "small", and those links point to data blocks directly. In the samples above, big.[sc] creates a 4KiB file with data in the first 8 bytes and the last 8 bytes, so the rest is sparse: we see this in rf0 dump 1, where the start of big (file 123 (I didn't renumber these, it's just fortuitous)) is contained in block 300 and the end — in block 302. The middle un-written-to blocks are 0 (not allocated).
Bigger files are deemed "large" and the 8 slots point to "indirect" blocks, which themselves contain 256 slots for disk block links. large.[sc] makes a 64KiB file in the same manner as big.?, so we observe that large (file 126) contains one direct link to block 304, which links to blocks 303 and (appropriately further on) 305. The contents of those blocks are the same as 300 and 302. (It's impossible to make a bigger (sparse or otherwise) file in V1 because all file offsets are 16 bits; this is immaterial. The maximum theoretical size of a large file is 1MiB, and later unixes allow addressing this in decreasingly hacky ways.)
To wit: a unix file system file consists of a sequence of "(read block n) or (insert 512 zero bytes)", and reading it sequentially consists of traversing that list, in order, until you've read as much data as the file is large. Small files inline that sequence for performance, large files keep it in chunks on disk (and if a chunk is missing, that's equivalent to 256× "insert 512 zero bytes"). If implemented this way, our original definition (and description from the manual) is fulfilled.
But, as observed above, this is not how unix (before V7) implements this: in the cat log, we can see that just reading the file allocates every formerly-sparse block: df loses 6 (8 − 2) and 126 (also the amount of sparse blocks between 300 and 302 in the indirect block) blocks respectively, and in the subsequent dump we observe this directly: every previously-zero block within the bounds of the file was allocated (unsparsed) and now actually links to a block full of zeroes.
To see why, we can trace the
sys read (II)
path through
sysread
→ readi-node
→ dskr (the read routine for non-special files)
→ mget
(noting that sys write (II) is basically the same but memcpys the other way;
comments verbatim from
Preliminary Release of UNIX Implementation Document,
J. DeFelice,
6/20/72
except where [indicated],
_-prefixed names expositional,
drum
is what they sometimes call the small disk containing the rootfs at this time):
struct {
int flgs;
char nlks;
char uid;
int size;
int dskp[8];
int ctim[2];
int mtim[2];
} i; // currently-open i-node
struct {
// (more spilled process state)
int *fofp;
int count;
} u;
extern r1, ret();
dskr(_ino/*, u.fofp, u.count*/) {
r1 = _ino;
/*i =*/ iget(/*r1*/); // get i-node (r1) into i-node section of core
r2 = i.size - *u.fofp; // file size […] subtract file offset
if(r2 <= 0)
goto *ret;
if(r2 <= u.count)
u.count = r2;
/*r1 =*/ mget(/*i, u.fofp*/); // […] physical block number of block […] where offset points
/*r5 =*/ dskrd(/*r1*/); // read in block, r5 points to 1st word of data in buffer
/*r1, r2, r3 =*/ sioreg(/*u.fofp, u.count, r5*/);
_memcpy(r1, r2, r3); // move data from buffer into working core […]
if(u.count == 0)
goto *ret;
else
goto *dskr;
}
extern r1, r2, r5, mq;
mget(/*i, u.fofp*/) /*-> r1*/ {
mq = *u.fofp >> 8; // divide […] by 256.
r2 = mq;
if(i.flgs & 010000)
goto _large;
if(r2 & 0xFFF0)
goto _small2large;
r1 = i.dskp[r2/2];
if(r1 == 0) { // if physical block num is zero then need a new block for file
/*r1 =*/ alloc(); // allocate a new block
i.dskp[r2/2] = r1; // physical block number stored in i-node
setimod(); // [mark i-node modified, update mtim]
clear(/*r1*/); // zero out disk/drum block just allocated
}
return;
_small2large: // adding on block which changes small file to a large file
// transfer old physical block pointers into new indirect block for the new large file
/*r1 =*/ alloc();
/*r5 =*/ wslot(/*r1*/);
_memcpy(r5, i.dskp, sizeof(i.dskp));
_memset(i.dskp, 0, sizeof(i.dskp));
// clear rest of data buffer
_memset(r5 + sizeof(i.dskp), 0, 512 - sizeof(i.dskp));
dskwr(/*r1*/);
i.dskp[0] = r1; // put pointer to indirect block in i-node
i.flgs |= 010000; // set large file bit […]
setimod();
goto *mget;
_large:
r2 &= 0xFF << 1; // […] offset in indirect block
int _in_indir = r2;
r2 = mq >> 8; // divide byte number by 256.
r2 &= 0xF;
r1 = i.dskp[r2/2];
if(r1 == 0) { // if no indirect block exists
/*r1 =*/ alloc(); // allocate a new block
i.dskp[r2/2] = r1; // put block number of new [indirect] block in i-node
setimod(); // [mark i-node modified, update mtim]
clear(/*r1*/); // [zero out disk/drum block just allocated]
}
/*r5 =*/ dskrd(/*r1*/);
int _indir = r1; // save block number of indirect block on stack
r1 = r5[r2 + _in_indir]; // put physical block no of block in file sought in r1
if(r1 == 0) { // if no block exists
/*r1 =*/ alloc(); // allocate a new block
r5[r2 + _in_indir] = r1; // put new block number into proper location
// in indirect block
r1 = _indir;
int _new = r5[r2 + _in_indir]; // […] block number of new block
wslot(/*r1*/);
dskwr(/*r1*/); // write newly modified indirect block back out on disk
r1 = _new;
clear(/*r1*/); // [zero out disk/drum block just allocated]
}
return;
}
where we see that reading/writing n bytes from/to file f at offset o is equivalent to
This is fine for writing, and equivalent to just 2. But 1. means the interface from the manual isn't fulfilled (thus, unix doesn't have sparse files before V7) and means this is more-so an implementation detail optimisation that ensures every I/O read completes in 1-3 I/Os instead of m = blocks(o - f.size).
Unless, of course, you always seek to the same offset before reading.
But that runs counter to unix's file-as-bag-of-bytes model,
and imposes a certain structure to the file and makes it behave differently based on access pattern,
which, again, runs counter to documentation and marketing of the time —
DRAFT: The UNIX Time-Sharing System, D. M. Ritchie,
mid-1971
3.1 Ordinary Files
A file contains whatever information the user places there, for example symbolic or binary (object) programs. No particular structuring is expected by the system. […] A few user programs generate and expect files with more structure; […] however, the structure of files is controlled solely by the programs which use them, not by the system.
3.5 System I/O Calls
[…] There is no distinction between "random" and sequential I/O, nor is any logical or physical record size imposed by the system. The size of a file on the disk is determined by the location of the last piece of information written on it; no predetermination of the size of a file is necessary.
— as well as those going forward (cf. The UNIX™ System: Making Computers More Productive, 1982, Bell Laboratories, transcribed in the previous post).
Because the files afflicted by this are valid, just much bigger than they ought to be, the only way you really could notice this is by examining the filesystem directly trying to look precisely for this behaviour, as I have? or, on quiet system, check precisely the interaction of this behaviour and df? Being fixed in V7 agrees with this because it implies to me that this facility had started to be used like we'd use sparse files today — with large nominal file size/filesystem size ratios — on the huge unix ≤V6 install base, someone filled their disk, and the fix is pretty simple.
--- /usr/sys/ux.s.bkp 1972-01-01 00:00:00.000000000 +0000
+++ /usr/sys/ux.s 1972-01-01 00:00:00.000000000 +0000
@@ -66,7 +66,7 @@
sysflg: .=.+1
pptiflg:.=.+1
ttyoch: .=.+1
- .even
+mget0b: .=.+1
.=.+100.; sstack:
buffer: .=.+[ntty*140.]
.=.+[nbuf*520.]
--- /usr/sys/u5.s.bkp 1972-01-01 00:00:00.000000000 +0000
+++ /usr/sys/u5.s 1972-01-01 00:00:00.000000000 +0000
@@ -1,6 +1,7 @@
/ u5 -- unix
mget:
+ mov idev,cdev
mov *u.fofp,mq / file offset in mq
clr ac / later to be high sig
mov $-8,lsh / divide ac/mq by 256.
@@ -13,6 +14,10 @@
mov i.dskp(r2),r1 / r1 has physical block number
bne 2f / if physical block num is zero then need a new block
/ for file
+ clr cdev
+ movb mget0b,r1
+ bne 2f
+ mov idev,cdev
jsr r0,alloc / allocate a new block
mov r1,i.dskp(r2) / physical block number stored in i-node
jsr r0,setimod / set inode modified byte (imod)
@@ -52,6 +57,10 @@
bic $!16,r2
mov i.dskp(r2),r1
bne 2f / if no indirect block exists
+ clr cdev
+ movb mget0b,r1
+ bne 3f
+ mov idev,cdev
jsr r0,alloc / allocate a new block
mov r1,i.dskp(r2) / put block number of new block in i-node
jsr r0,setimod / set i-node modified byte
@@ -65,6 +74,10 @@
mov (r2),r1 / put physical block no of block in file
/ sought in r1
bne 2f / if no block exists
+ clr cdev
+ movb mget0b,r1
+ bne 2f
+ mov idev,cdev
jsr r0,alloc / allocate a new block
mov r1,(r2) / put new block number into proper location in
/ indirect block
@@ -76,6 +89,7 @@
mov (sp),r1 / restore block number of new block
jsr r0,clear / clear new block
2:
+3:
tst (sp)+ / bump stack pointer
rts r0
--- /usr/sys/u6.s.bkp 1972-01-01 00:00:00.000000000 +0000
+++ /usr/sys/u6.s 1972-01-01 00:00:00.000000000 +0000
@@ -83,6 +83,7 @@
jmp error / see 'error' routine
dskr:
+ movb $1,mget0b
mov (sp),r1 / i-number in r1
jsr r0,iget / get i-node (r1) into i-node section of core
mov i.size,r2 / file size in bytes in r2
@@ -210,6 +211,7 @@
jmp error / ?
dskw: / write routine for non-special files
+ clrb mget0b
mov (sp),r1 / get an i-node number from the stack into r1
jsr r0,iget / write i-node out (if modified), read i-node 'r1'
/ into i-node area of core
With the patch, when it encounters a 0 link and mget0b is non-zero, mget will return block mget0b on rf0. Because mget needs to return a block number (on the current device), this is the least intrusive way of implementing this, but it means mget needs to be careful to maintain the notion of the "current device" to mean "the device containing the current i-node" outside of this exceptional return, since the latter will vary when reading a file from the mounted filesystem. dskr sets mget0b to 1 — due to the unique way the V1 unix file system is laid out, block 1 on the rootfs can't usefully be anything except full of zeroes — dskw clears mget0b to get the old behaviour. The procedure for building and installing a thusly-updated kernel in a unix72 environment is outlined in post 023,a. V1 unix I/O buffer count vs. performance benchmark.
# df
718+2309
# cat big large /usr/big /usr/large
@ @@ @@ @@ @@ @@ @@ @@ @# df
718+2309
df and a direct examination confirm the blocks are unchanged by the read (and cdev is restored properly), but are not required because the cat completes significantly faster.
@@ -1,6 +1,7 @@
extern r1, r2, r5, mq;
mget(/*i, u.fofp*/) /*-> r1*/ {
+ cdev = idev;
mq = *u.fofp >> 8; // divide […] by 256.
r2 = mq;
@@ -13,6 +14,11 @@
r1 = i.dskp[r2/2];
if(r1 == 0) { // if physical block num is zero then need a new block for file
+ cdev = 0;
+ if(r1 = mget0b)
+ return;
+ cdev = idev;
+
/*r1 =*/ alloc(); // allocate a new block
i.dskp[r2/2] = r1; // physical block number stored in i-node
setimod(); // [mark i-node modified, update mtim]
@@ -43,6 +49,11 @@
r2 &= 0xF;
r1 = i.dskp[r2/2];
if(r1 == 0) { // if no indirect block exists
+ cdev = 0;
+ if(r1 = mget0b)
+ return;
+ cdev = idev;
+
/*r1 =*/ alloc(); // allocate a new block
i.dskp[r2/2] = r1; // put block number of new [indirect] block in i-node
setimod(); // [mark i-node modified, update mtim]
@@ -52,6 +63,11 @@
int _indir = r1; // save block number of indirect block on stack
r1 = r5[r2 + _in_indir]; // put physical block no of block in file sought in r1
if(r1 == 0) { // if no block exists
+ cdev = 0;
+ if(r1 = mget0b)
+ return;
+ cdev = idev;
+
/*r1 =*/ alloc(); // allocate a new block
r5[r2 + _in_indir] = r1; // put new block number into proper location
// in indirect block
Nit-pick? Correction? Improvement? Annoying? Cute? Anything?
Mail,
post, or open!