/[pkg-reiser]/debian/kernel-patch-2.6-reiser4/reiser4-for-2.6.16-4.patch
ViewVC logotype

Contents of /debian/kernel-patch-2.6-reiser4/reiser4-for-2.6.16-4.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Wed May 31 20:28:04 2006 UTC (6 years, 11 months ago) by cavok
Branch: source-dist, MAIN
CVS Tags: upstream_version_2_6_16-4, HEAD
Changes since 1.1: +0 -0 lines
Imported upstream version 2.6.16-4. 
1
2 reiser4 for 2.6.16
3
4 Documentation/Changes | 12
5 Documentation/filesystems/reiser4.txt | 75
6 fs/Kconfig | 2
7 fs/Makefile | 1
8 fs/fs-writeback.c | 26
9 fs/reiser4/Kconfig | 31
10 fs/reiser4/Makefile | 100
11 fs/reiser4/README | 125
12 fs/reiser4/carry_ops.h | 42
13 fs/reiser4/dformat.h | 71
14 fs/reiser4/dscale.h | 27
15 fs/reiser4/entd.h | 90
16 fs/reiser4/estimate.c | 111
17 fs/reiser4/ioctl.h | 41
18 fs/reiser4/kassign.h | 110
19 fs/reiser4/key.c | 137
20 fs/reiser4/ktxnmgrd.h | 52
21 fs/reiser4/oid.c | 141
22 fs/reiser4/page_cache.h | 62
23 fs/reiser4/plugin/Makefile | 26
24 fs/reiser4/plugin/cluster.c | 66
25 fs/reiser4/plugin/compress/Makefile | 6
26 fs/reiser4/plugin/compress/compress.h | 38
27 fs/reiser4/plugin/compress/minilzo.h | 94
28 fs/reiser4/plugin/crypto/cipher.c | 116
29 fs/reiser4/plugin/crypto/cipher.h | 67
30 fs/reiser4/plugin/crypto/digest.c | 58
31 fs/reiser4/plugin/dir/Makefile | 5
32 fs/reiser4/plugin/dir/dir.h | 36
33 fs/reiser4/plugin/dir/hashed_dir.c | 81
34 fs/reiser4/plugin/dir/seekable_dir.c | 46
35 fs/reiser4/plugin/disk_format/Makefile | 5
36 fs/reiser4/plugin/disk_format/disk_format.c | 37
37 fs/reiser4/plugin/disk_format/disk_format.h | 27
38 fs/reiser4/plugin/disk_format/disk_format40.h | 99
39 fs/reiser4/plugin/fibration.h | 37
40 fs/reiser4/plugin/file/Makefile | 7
41 fs/reiser4/plugin/file/symfile.c | 87
42 fs/reiser4/plugin/file/symlink.c | 92
43 fs/reiser4/plugin/item/Makefile | 18
44 fs/reiser4/plugin/item/acl.h | 66
45 fs/reiser4/plugin/item/blackbox.c | 142
46 fs/reiser4/plugin/item/blackbox.h | 33
47 fs/reiser4/plugin/item/cde.h | 87
48 fs/reiser4/plugin/item/ctail.h | 89
49 fs/reiser4/plugin/item/internal.h | 57
50 fs/reiser4/plugin/item/sde.h | 66
51 fs/reiser4/plugin/item/tail.h | 58
52 fs/reiser4/plugin/node/Makefile | 5
53 fs/reiser4/plugin/node/node.c | 131
54 fs/reiser4/plugin/node/node40.h | 125
55 fs/reiser4/plugin/object.h | 121
56 fs/reiser4/plugin/plugin_header.h | 136
57 fs/reiser4/plugin/plugin_set.h | 83
58 fs/reiser4/plugin/regular.c | 44
59 fs/reiser4/plugin/security/Makefile | 4
60 fs/reiser4/plugin/security/perm.c | 44
61 fs/reiser4/plugin/security/perm.h | 82
62 fs/reiser4/plugin/space/Makefile | 4
63 fs/reiser4/plugin/space/bitmap.h | 47
64 fs/reiser4/plugin/space/space_allocator.h | 80
65 fs/reiser4/plugin/tail_policy.c | 113
66 fs/reiser4/pool.h | 54
67 fs/reiser4/readahead.c | 138
68 fs/reiser4/readahead.h | 48
69 fs/reiser4/safe_link.h | 29
70 fs/reiser4/seal.h | 49
71 fs/reiser4/status_flags.h | 43
72 fs/reiser4/tap.h | 69
73 fs/reiser4/tree_mod.h | 29
74 fs/reiser4/tree_walk.h | 125
75 fs/reiser4/vfs_ops.h | 58
76 fs/reiser4/wander.h | 135
77 fs/reiser4/writeout.h | 21
78 include/linux/fs.h | 3
79 lib/radix-tree.c | 1
80 mm/filemap.c | 6
81 mm/page-writeback.c | 2
82 mm/readahead.c | 1
83
84 diff -puN mm/readahead.c~reiser4-export-handle_ra_miss mm/readahead.c
85
86
87 Documentation/Changes | 12
88 Documentation/filesystems/reiser4.txt | 75
89 fs/Kconfig | 2
90 fs/Makefile | 1
91 fs/fs-writeback.c | 26
92 fs/reiser4/Kconfig | 31
93 fs/reiser4/Makefile | 100
94 fs/reiser4/README | 125
95 fs/reiser4/as_ops.c | 389 ++
96 fs/reiser4/block_alloc.c | 1139 +++++++
97 fs/reiser4/block_alloc.h | 175 +
98 fs/reiser4/blocknrset.c | 368 ++
99 fs/reiser4/carry.c | 1381 +++++++++
100 fs/reiser4/carry.h | 442 +++
101 fs/reiser4/carry_ops.c | 2103 ++++++++++++++
102 fs/reiser4/carry_ops.h | 42
103 fs/reiser4/context.c | 278 +
104 fs/reiser4/context.h | 228 +
105 fs/reiser4/coord.c | 937 ++++++
106 fs/reiser4/coord.h | 389 ++
107 fs/reiser4/debug.c | 300 ++
108 fs/reiser4/debug.h | 350 ++
109 fs/reiser4/dformat.h | 71
110 fs/reiser4/dscale.c | 174 +
111 fs/reiser4/dscale.h | 27
112 fs/reiser4/entd.c | 356 ++
113 fs/reiser4/entd.h | 90
114 fs/reiser4/eottl.c | 510 +++
115 fs/reiser4/estimate.c | 111
116 fs/reiser4/export_ops.c | 296 ++
117 fs/reiser4/flush.c | 3626 ++++++++++++++++++++++++
118 fs/reiser4/flush.h | 274 +
119 fs/reiser4/flush_queue.c | 681 ++++
120 fs/reiser4/forward.h | 258 +
121 fs/reiser4/fsdata.c | 803 +++++
122 fs/reiser4/fsdata.h | 218 +
123 fs/reiser4/init_super.c | 739 +++++
124 fs/reiser4/inode.c | 727 ++++
125 fs/reiser4/inode.h | 430 ++
126 fs/reiser4/ioctl.h | 41
127 fs/reiser4/jnode.c | 1921 +++++++++++++
128 fs/reiser4/jnode.h | 711 ++++
129 fs/reiser4/kassign.c | 659 ++++
130 fs/reiser4/kassign.h | 110
131 fs/reiser4/key.c | 137
132 fs/reiser4/key.h | 384 ++
133 fs/reiser4/ktxnmgrd.c | 214 +
134 fs/reiser4/ktxnmgrd.h | 52
135 fs/reiser4/lock.c | 1261 ++++++++
136 fs/reiser4/lock.h | 272 +
137 fs/reiser4/oid.c | 141
138 fs/reiser4/page_cache.c | 712 ++++
139 fs/reiser4/page_cache.h | 62
140 fs/reiser4/plugin/Makefile | 26
141 fs/reiser4/plugin/cluster.c | 66
142 fs/reiser4/plugin/cluster.h | 316 ++
143 fs/reiser4/plugin/compress/Makefile | 6
144 fs/reiser4/plugin/compress/compress.c | 370 ++
145 fs/reiser4/plugin/compress/compress.h | 38
146 fs/reiser4/plugin/compress/compress_mode.c | 163 +
147 fs/reiser4/plugin/compress/lzoconf.h | 420 ++
148 fs/reiser4/plugin/compress/minilzo.c | 2155 ++++++++++++++
149 fs/reiser4/plugin/compress/minilzo.h | 94
150 fs/reiser4/plugin/crypto/cipher.c | 116
151 fs/reiser4/plugin/crypto/cipher.h | 67
152 fs/reiser4/plugin/crypto/digest.c | 58
153 fs/reiser4/plugin/dir/Makefile | 5
154 fs/reiser4/plugin/dir/dir.h | 36
155 fs/reiser4/plugin/dir/hashed_dir.c | 81
156 fs/reiser4/plugin/dir/seekable_dir.c | 46
157 fs/reiser4/plugin/dir_plugin_common.c | 864 +++++
158 fs/reiser4/plugin/disk_format/Makefile | 5
159 fs/reiser4/plugin/disk_format/disk_format.c | 37
160 fs/reiser4/plugin/disk_format/disk_format.h | 27
161 fs/reiser4/plugin/disk_format/disk_format40.c | 556 +++
162 fs/reiser4/plugin/disk_format/disk_format40.h | 99
163 fs/reiser4/plugin/fibration.c | 174 +
164 fs/reiser4/plugin/fibration.h | 37
165 fs/reiser4/plugin/file/Makefile | 7
166 fs/reiser4/plugin/file/cryptcompress.c | 3817 ++++++++++++++++++++++++++
167 fs/reiser4/plugin/file/cryptcompress.h | 551 +++
168 fs/reiser4/plugin/file/file.c | 2705 ++++++++++++++++++
169 fs/reiser4/plugin/file/file.h | 257 +
170 fs/reiser4/plugin/file/invert.c | 493 +++
171 fs/reiser4/plugin/file/symfile.c | 87
172 fs/reiser4/plugin/file/symlink.c | 92
173 fs/reiser4/plugin/file/tail_conversion.c | 728 ++++
174 fs/reiser4/plugin/file_ops.c | 167 +
175 fs/reiser4/plugin/file_ops_readdir.c | 654 ++++
176 fs/reiser4/plugin/file_plugin_common.c | 929 ++++++
177 fs/reiser4/plugin/hash.c | 350 ++
178 fs/reiser4/plugin/inode_ops.c | 886 ++++++
179 fs/reiser4/plugin/inode_ops_rename.c | 904 ++++++
180 fs/reiser4/plugin/item/Makefile | 18
181 fs/reiser4/plugin/item/acl.h | 66
182 fs/reiser4/plugin/item/blackbox.c | 142
183 fs/reiser4/plugin/item/blackbox.h | 33
184 fs/reiser4/plugin/item/cde.c | 1007 ++++++
185 fs/reiser4/plugin/item/cde.h | 87
186 fs/reiser4/plugin/item/ctail.c | 1588 ++++++++++
187 fs/reiser4/plugin/item/ctail.h | 89
188 fs/reiser4/plugin/item/extent.c | 197 +
189 fs/reiser4/plugin/item/extent.h | 228 +
190 fs/reiser4/plugin/item/extent_file_ops.c | 1712 +++++++++++
191 fs/reiser4/plugin/item/extent_flush_ops.c | 1018 ++++++
192 fs/reiser4/plugin/item/extent_item_ops.c | 882 ++++++
193 fs/reiser4/plugin/item/internal.c | 392 ++
194 fs/reiser4/plugin/item/internal.h | 57
195 fs/reiser4/plugin/item/item.c | 727 ++++
196 fs/reiser4/plugin/item/item.h | 399 ++
197 fs/reiser4/plugin/item/sde.c | 190 +
198 fs/reiser4/plugin/item/sde.h | 66
199 fs/reiser4/plugin/item/static_stat.c | 1040 +++++++
200 fs/reiser4/plugin/item/static_stat.h | 219 +
201 fs/reiser4/plugin/item/tail.c | 805 +++++
202 fs/reiser4/plugin/item/tail.h | 58
203 fs/reiser4/plugin/node/Makefile | 5
204 fs/reiser4/plugin/node/node.c | 131
205 fs/reiser4/plugin/node/node.h | 272 +
206 fs/reiser4/plugin/node/node40.c | 2924 +++++++++++++++++++
207 fs/reiser4/plugin/node/node40.h | 125
208 fs/reiser4/plugin/object.c | 501 +++
209 fs/reiser4/plugin/object.h | 121
210 fs/reiser4/plugin/plugin.c | 533 +++
211 fs/reiser4/plugin/plugin.h | 936 ++++++
212 fs/reiser4/plugin/plugin_header.h | 136
213 fs/reiser4/plugin/plugin_set.c | 378 ++
214 fs/reiser4/plugin/plugin_set.h | 83
215 fs/reiser4/plugin/regular.c | 44
216 fs/reiser4/plugin/security/Makefile | 4
217 fs/reiser4/plugin/security/perm.c | 44
218 fs/reiser4/plugin/security/perm.h | 82
219 fs/reiser4/plugin/space/Makefile | 4
220 fs/reiser4/plugin/space/bitmap.c | 1592 ++++++++++
221 fs/reiser4/plugin/space/bitmap.h | 47
222 fs/reiser4/plugin/space/space_allocator.h | 80
223 fs/reiser4/plugin/tail_policy.c | 113
224 fs/reiser4/pool.c | 236 +
225 fs/reiser4/pool.h | 54
226 fs/reiser4/readahead.c | 138
227 fs/reiser4/readahead.h | 48
228 fs/reiser4/reiser4.h | 276 +
229 fs/reiser4/safe_link.c | 351 ++
230 fs/reiser4/safe_link.h | 29
231 fs/reiser4/seal.c | 217 +
232 fs/reiser4/seal.h | 49
233 fs/reiser4/search.c | 1611 ++++++++++
234 fs/reiser4/status_flags.c | 176 +
235 fs/reiser4/status_flags.h | 43
236 fs/reiser4/super.c | 313 ++
237 fs/reiser4/super.h | 468 +++
238 fs/reiser4/super_ops.c | 721 ++++
239 fs/reiser4/tap.c | 377 ++
240 fs/reiser4/tap.h | 69
241 fs/reiser4/tree.c | 1875 ++++++++++++
242 fs/reiser4/tree.h | 579 +++
243 fs/reiser4/tree_mod.c | 383 ++
244 fs/reiser4/tree_mod.h | 29
245 fs/reiser4/tree_walk.c | 926 ++++++
246 fs/reiser4/tree_walk.h | 125
247 fs/reiser4/txnmgr.c | 3158 +++++++++++++++++++++
248 fs/reiser4/txnmgr.h | 704 ++++
249 fs/reiser4/type_safe_hash.h | 320 ++
250 fs/reiser4/vfs_ops.c | 267 +
251 fs/reiser4/vfs_ops.h | 58
252 fs/reiser4/wander.c | 1799 ++++++++++++
253 fs/reiser4/wander.h | 135
254 fs/reiser4/writeout.h | 21
255 fs/reiser4/znode.c | 1028 +++++++
256 fs/reiser4/znode.h | 434 ++
257 include/linux/fs.h | 3
258 lib/radix-tree.c | 1
259 mm/filemap.c | 6
260 mm/page-writeback.c | 3
261 mm/readahead.c | 2
262 175 files changed, 79645 insertions(+), 12 deletions(-)
263
264 diff -puN Documentation/Changes~reiser4-for-2.6.16-3 Documentation/Changes
265 --- linux-2.6.16-3/Documentation/Changes~reiser4-for-2.6.16-3 2006-05-30 18:51:49.928551250 +0400
266 +++ linux-2.6.16-3-vs/Documentation/Changes 2006-05-30 18:51:50.052559000 +0400
267 @@ -54,6 +54,7 @@ o module-init-tools 0.9.10
268 o e2fsprogs 1.29 # tune2fs
269 o jfsutils 1.1.3 # fsck.jfs -V
270 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
271 +o reiser4progs 1.0.0 # fsck.reiser4 -V
272 o xfsprogs 2.6.0 # xfs_db -V
273 o pcmciautils 004
274 o pcmcia-cs 3.1.21 # cardmgr -V
275 @@ -163,6 +164,13 @@ The reiserfsprogs package should be used
276 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
277 reiserfsck. These utils work on both i386 and alpha platforms.
278
279 +Reiser4progs
280 +------------
281 +
282 +The reiser4progs package contains utilities for the reiser4 file system.
283 +Detailed instructions are provided in the README file located at:
284 +<ftp://ftp.namesys.com/pub/reiser4progs/README>.
285 +
286 Xfsprogs
287 --------
288
289 @@ -344,6 +352,10 @@ Reiserfsprogs
290 -------------
291 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
292
293 +Reiser4progs
294 +------------
295 +o <ftp://ftp.namesys.com/pub/reiser4progs/>
296 +
297 Xfsprogs
298 --------
299 o <ftp://oss.sgi.com/projects/xfs/download/>
300 diff -puN /dev/null Documentation/filesystems/reiser4.txt
301 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
302 +++ linux-2.6.16-3-vs/Documentation/filesystems/reiser4.txt 2006-05-30 18:51:50.056559250 +0400
303 @@ -0,0 +1,75 @@
304 +Reiser4 filesystem
305 +==================
306 +Reiser4 is a file system based on dancing tree algorithms, and is
307 +described at http://www.namesys.com
308 +
309 +
310 +References
311 +==========
312 +web page http://namesys.com/v4/v4.html
313 +source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
314 +userland tools ftp://ftp.namesys.com/pub/reiser4progs/
315 +install page http://www.namesys.com/install_v4.html
316 +
317 +Compile options
318 +===============
319 +Enable reiser4 debug mode
320 + This checks everything imaginable while reiser4
321 + runs
322 +
323 +Mount options
324 +=============
325 +tmgr.atom_max_size=N
326 + Atoms containing more than N blocks will be forced to commit.
327 + N is decimal.
328 + Default is nr_free_pagecache_pages() / 2 at mount time.
329 +
330 +tmgr.atom_max_age=N
331 + Atoms older than N seconds will be forced to commit. N is decimal.
332 + Default is 600.
333 +
334 +tmgr.atom_max_flushers=N
335 + Limit of concurrent flushers for one atom. 0 means no limit.
336 + Default is 0.
337 +
338 +tree.cbk_cache.nr_slots=N
339 + Number of slots in the cbk cache.
340 +
341 +flush.relocate_threshold=N
342 + If flush finds more than N adjacent dirty leaf-level blocks it
343 + will force them to be relocated.
344 + Default is 64.
345 +
346 +flush.relocate_distance=N
346 + If flush can find a block allocation closer than at most
348 + N from the preceder it will relocate to that position.
349 + Default is 64.
350 +
351 +flush.scan_maxnodes=N
352 + The maximum number of nodes to scan left on a level during
353 + flush.
354 + Default is 10000.
355 +
356 +optimal_io_size=N
357 + Preferred IO size. This value is used to set st_blksize of
358 + struct stat.
359 + Default is 65536.
360 +
361 +bsdgroups
362 + Turn on BSD-style gid assignment.
363 +
364 +32bittimes
364 + By default files in reiser4 have 64 bit timestamps. Files
366 + created when filesystem is mounted with 32bittimes mount
367 + option will get 32 bit timestamps.
368 +
369 +mtflush
370 + Turn off concurrent flushing.
371 +
372 +nopseudo
373 + Disable pseudo files support. See
374 + http://namesys.com/v4/pseudo.html for more about pseudo files.
375 +
376 +dont_load_bitmap
377 + Don't load all bitmap blocks at mount time, it is useful for
378 + machines with tiny RAM and large disks.
379 diff -puN fs/Kconfig~reiser4-for-2.6.16-3 fs/Kconfig
380 --- linux-2.6.16-3/fs/Kconfig~reiser4-for-2.6.16-3 2006-05-30 18:51:49.936551750 +0400
381 +++ linux-2.6.16-3-vs/fs/Kconfig 2006-05-30 18:51:50.060559500 +0400
382 @@ -177,6 +177,8 @@ config FS_MBCACHE
383 default y if EXT2_FS=y || EXT3_FS=y
384 default m if EXT2_FS=m || EXT3_FS=m
385
386 +source "fs/reiser4/Kconfig"
387 +
388 config REISERFS_FS
389 tristate "Reiserfs support"
390 help
391 diff -puN fs/Makefile~reiser4-for-2.6.16-3 fs/Makefile
392 --- linux-2.6.16-3/fs/Makefile~reiser4-for-2.6.16-3 2006-05-30 18:51:49.936551750 +0400
393 +++ linux-2.6.16-3-vs/fs/Makefile 2006-05-30 18:51:50.064559750 +0400
394 @@ -51,6 +51,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o
395
396 # Do not add any filesystems before this line
397 obj-$(CONFIG_REISERFS_FS) += reiserfs/
398 +obj-$(CONFIG_REISER4_FS) += reiser4/
399 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
400 obj-$(CONFIG_JBD) += jbd/
401 obj-$(CONFIG_EXT2_FS) += ext2/
402 diff -puN fs/fs-writeback.c~reiser4-for-2.6.16-3 fs/fs-writeback.c
403 --- linux-2.6.16-3/fs/fs-writeback.c~reiser4-for-2.6.16-3 2006-05-30 18:51:49.940552000 +0400
404 +++ linux-2.6.16-3-vs/fs/fs-writeback.c 2006-05-30 18:51:50.028557500 +0400
405 @@ -286,8 +286,6 @@ __writeback_single_inode(struct inode *i
406 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
407 * that it can be located for waiting on in __writeback_single_inode().
408 *
409 - * Called under inode_lock.
410 - *
411 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
412 * This function assumes that the blockdev superblock's inodes are backed by
413 * a variety of queues, so all inodes are searched. For other superblocks,
414 @@ -303,11 +301,13 @@ __writeback_single_inode(struct inode *i
415 * on the writer throttling path, and we get decent balancing between many
416 * throttled threads: we don't want them all piling up on __wait_on_inode.
417 */
418 -static void
419 -sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
420 +void
421 +generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
422 {
423 const unsigned long start = jiffies; /* livelock avoidance */
424
425 + spin_lock(&inode_lock);
426 +
427 if (!wbc->for_kupdate || list_empty(&sb->s_io))
428 list_splice_init(&sb->s_dirty, &sb->s_io);
429
430 @@ -387,8 +387,19 @@ sync_sb_inodes(struct super_block *sb, s
431 if (wbc->nr_to_write <= 0)
432 break;
433 }
434 + spin_unlock(&inode_lock);
435 return; /* Leave any unwritten inodes on s_io */
436 }
437 +EXPORT_SYMBOL(generic_sync_sb_inodes);
438 +
439 +static void
440 +sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
441 +{
442 + if (sb->s_op->sync_inodes)
443 + sb->s_op->sync_inodes(sb, wbc);
444 + else
445 + generic_sync_sb_inodes(sb, wbc);
446 +}
447
448 /*
449 * Start writeback of dirty pagecache data against all unlocked inodes.
450 @@ -429,11 +440,8 @@ restart:
451 * be unmounted by the time it is released.
452 */
453 if (down_read_trylock(&sb->s_umount)) {
454 - if (sb->s_root) {
455 - spin_lock(&inode_lock);
456 + if (sb->s_root)
457 sync_sb_inodes(sb, wbc);
458 - spin_unlock(&inode_lock);
459 - }
460 up_read(&sb->s_umount);
461 }
462 spin_lock(&sb_lock);
463 @@ -469,9 +477,7 @@ void sync_inodes_sb(struct super_block *
464 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
465 nr_dirty + nr_unstable;
466 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
467 - spin_lock(&inode_lock);
468 sync_sb_inodes(sb, &wbc);
469 - spin_unlock(&inode_lock);
470 }
471
472 /*
473 diff -puN /dev/null fs/reiser4/Kconfig
474 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
475 +++ linux-2.6.16-3-vs/fs/reiser4/Kconfig 2006-05-30 18:51:50.064559750 +0400
476 @@ -0,0 +1,31 @@
477 +config REISER4_FS
478 + tristate "Reiser4 (EXPERIMENTAL)"
479 + depends on EXPERIMENTAL
480 + select ZLIB_INFLATE
481 + select ZLIB_DEFLATE
482 + help
483 + Reiser4 is a filesystem that performs all filesystem operations
484 + as atomic transactions, which means that it either performs a
485 + write, or it does not, and in the event of a crash it does not
486 + partially perform it or corrupt it.
487 +
488 + It stores files in dancing trees, which are like balanced trees but
489 + faster. It packs small files together so that they share blocks
490 + without wasting space. This means you can use it to store really
491 + small files. It also means that it saves you disk space. It avoids
492 + hassling you with anachronisms like having a maximum number of
493 + inodes, and wasting space if you use less than that number.
494 +
495 + Reiser4 is a distinct filesystem type from reiserfs (V3).
496 + It's therefore not possible to use reiserfs file systems
497 + with reiser4.
498 +
499 + To learn more about reiser4, go to http://www.namesys.com
500 +
501 +config REISER4_DEBUG
502 + bool "Enable reiser4 debug mode"
503 + depends on REISER4_FS
504 + help
505 + Don't use this unless you are debugging reiser4.
506 +
507 + If unsure, say N.
508 diff -puN /dev/null fs/reiser4/Makefile
509 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
510 +++ linux-2.6.16-3-vs/fs/reiser4/Makefile 2006-05-30 18:51:50.068560000 +0400
511 @@ -0,0 +1,100 @@
512 +#
513 +# reiser4/Makefile
514 +#
515 +
516 +obj-$(CONFIG_REISER4_FS) += reiser4.o
517 +
518 +reiser4-y := \
519 + debug.o \
520 + jnode.o \
521 + znode.o \
522 + key.o \
523 + pool.o \
524 + tree_mod.o \
525 + estimate.o \
526 + carry.o \
527 + carry_ops.o \
528 + lock.o \
529 + tree.o \
530 + context.o \
531 + tap.o \
532 + coord.o \
533 + block_alloc.o \
534 + txnmgr.o \
535 + kassign.o \
536 + flush.o \
537 + wander.o \
538 + eottl.o \
539 + search.o \
540 + page_cache.o \
541 + seal.o \
542 + dscale.o \
543 + flush_queue.o \
544 + ktxnmgrd.o \
545 + blocknrset.o \
546 + super.o \
547 + super_ops.o \
548 + fsdata.o \
549 + export_ops.o \
550 + oid.o \
551 + tree_walk.o \
552 + inode.o \
553 + vfs_ops.o \
554 + as_ops.o \
555 + entd.o\
556 + readahead.o \
557 + status_flags.o \
558 + init_super.o \
559 + safe_link.o \
560 + \
561 + plugin/plugin.o \
562 + plugin/plugin_set.o \
563 + plugin/node/node.o \
564 + plugin/object.o \
565 + plugin/cluster.o \
566 + plugin/inode_ops.o \
567 + plugin/inode_ops_rename.o \
568 + plugin/file_ops.o \
569 + plugin/file_ops_readdir.o \
570 + plugin/file_plugin_common.o \
571 + plugin/file/file.o \
572 + plugin/file/tail_conversion.o \
573 + plugin/file/symlink.o \
574 + plugin/file/cryptcompress.o \
575 + plugin/dir_plugin_common.o \
576 + plugin/dir/hashed_dir.o \
577 + plugin/dir/seekable_dir.o \
578 + plugin/node/node40.o \
579 + \
580 + plugin/crypto/cipher.o \
581 + plugin/crypto/digest.o \
582 + \
583 + plugin/compress/minilzo.o \
584 + plugin/compress/compress.o \
585 + plugin/compress/compress_mode.o \
586 + \
587 + plugin/item/static_stat.o \
588 + plugin/item/sde.o \
589 + plugin/item/cde.o \
590 + plugin/item/blackbox.o \
591 + plugin/item/internal.o \
592 + plugin/item/tail.o \
593 + plugin/item/ctail.o \
594 + plugin/item/extent.o \
595 + plugin/item/extent_item_ops.o \
596 + plugin/item/extent_file_ops.o \
597 + plugin/item/extent_flush_ops.o \
598 + \
599 + plugin/hash.o \
600 + plugin/fibration.o \
601 + plugin/tail_policy.o \
602 + plugin/item/item.o \
603 + \
604 + plugin/security/perm.o \
605 + plugin/space/bitmap.o \
606 + \
607 + plugin/disk_format/disk_format40.o \
608 + plugin/disk_format/disk_format.o \
609 + \
610 + plugin/regular.o
611 +
612 diff -puN /dev/null fs/reiser4/README
613 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
614 +++ linux-2.6.16-3-vs/fs/reiser4/README 2006-05-30 18:51:50.068560000 +0400
615 @@ -0,0 +1,125 @@
616 +[LICENSING]
617 +
618 +Reiser4 is hereby licensed under the GNU General
619 +Public License version 2.
620 +
621 +Source code files that contain the phrase "licensing governed by
622 +reiser4/README" are "governed files" throughout this file. Governed
623 +files are licensed under the GPL. The portions of them owned by Hans
624 +Reiser, or authorized to be licensed by him, have been in the past,
625 +and likely will be in the future, licensed to other parties under
626 +other licenses. If you add your code to governed files, and don't
627 +want it to be owned by Hans Reiser, put your copyright label on that
628 +code so the poor blight and his customers can keep things straight.
629 +All portions of governed files not labeled otherwise are owned by Hans
630 +Reiser, and by adding your code to it, widely distributing it to
631 +others or sending us a patch, and leaving the sentence in stating that
632 +licensing is governed by the statement in this file, you accept this.
633 +It will be a kindness if you identify whether Hans Reiser is allowed
634 +to license code labeled as owned by you on your behalf other than
635 +under the GPL, because he wants to know if it is okay to do so and put
636 +a check in the mail to you (for non-trivial improvements) when he
637 +makes his next sale. He makes no guarantees as to the amount if any,
638 +though he feels motivated to motivate contributors, and you can surely
639 +discuss this with him before or after contributing. You have the
640 +right to decline to allow him to license your code contribution other
641 +than under the GPL.
642 +
643 +Further licensing options are available for commercial and/or other
644 +interests directly from Hans Reiser: reiser@namesys.com. If you interpret
645 +the GPL as not allowing those additional licensing options, you read
646 +it wrongly, and Richard Stallman agrees with me, when carefully read
647 +you can see that those restrictions on additional terms do not apply
648 +to the owner of the copyright, and my interpretation of this shall
649 +govern for this license.
650 +
651 +[END LICENSING]
652 +
653 +Reiser4 is a file system based on dancing tree algorithms, and is
654 +described at http://www.namesys.com
655 +
656 +mkfs.reiser4 and other utilities are on our webpage or wherever your
657 +Linux provider put them. You really want to be running the latest
658 +version off the website if you use fsck.
659 +
660 +Yes, if you update your reiser4 kernel module you do have to
661 +recompile your kernel, most of the time. The errors you get will be
662 +quite cryptic if you forget to do so.
663 +
664 +Hideous Commercial Pitch: Spread your development costs across other OS
665 +vendors. Select from the best in the world, not the best in your
666 +building, by buying from third party OS component suppliers. Leverage
667 +the software component development power of the internet. Be the most
668 +aggressive in taking advantage of the commercial possibilities of
669 +decentralized internet development, and add value through your branded
670 +integration that you sell as an operating system. Let your competitors
671 +be the ones to compete against the entire internet by themselves. Be
672 +hip, get with the new economic trend, before your competitors do. Send
673 +email to reiser@namesys.com
674 +
675 +Hans Reiser was the primary architect of Reiser4, but a whole team
676 +chipped their ideas in. He invested everything he had into Namesys
677 +for 5.5 dark years of no money before Reiser3 finally started to work well
678 +enough to bring in money. He owns the copyright.
679 +
680 +DARPA was the primary sponsor of Reiser4. DARPA does not endorse
681 +Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
682 +opinion, unique in its willingness to invest into things more
683 +theoretical than the VC community can readily understand, and more
684 +longterm than allows them to be sure that they will be the ones to
685 +extract the economic benefits from. DARPA also integrated us into a
686 +security community that transformed our security worldview.
687 +
688 +Vladimir Saveliev is our lead programmer, with us from the beginning,
689 +and he worked long hours writing the cleanest code. This is why he is
690 +now the lead programmer after years of commitment to our work. He
691 +always made the effort to be the best he could be, and to make his
692 +code the best that it could be. What resulted was quite remarkable. I
693 +don't think that money can ever motivate someone to work the way he
694 +did, he is one of the most selfless men I know.
695 +
696 +Alexander Lyamin was our sysadmin, and helped to educate us in
697 +security issues. Moscow State University and IMT were very generous
698 +in the internet access they provided us, and in lots of other little
699 +ways that a generous institution can be.
700 +
701 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
702 +locking code, the block allocator, and finished the flushing code.
703 +His code is always crystal clean and well structured.
704 +
705 +Nikita Danilov wrote the core of the balancing code, the core of the
706 +plugins code, and the directory code. He worked a steady pace of long
707 +hours that produced a whole lot of well abstracted code. He is our
708 +senior computer scientist.
709 +
710 +Vladimir Demidov wrote the parser. Writing an in kernel parser is
711 +something very few persons have the skills for, and it is thanks to
712 +him that we can say that the parser is really not so big compared to
713 +various bits of our other code, and making a parser work in the kernel
714 +was not so complicated as everyone would imagine mainly because it was
715 +him doing it...
716 +
717 +Joshua McDonald wrote the transaction manager, and the flush code.
718 +The flush code unexpectedly turned out to be extremely hairy for reasons
719 +you can read about on our web page, and he did a great job on an
720 +extremely difficult task.
721 +
722 +Nina Reiser handled our accounting, government relations, and much
723 +more.
724 +
725 +Ramon Reiser developed our website.
726 +
727 +Beverly Palmer drew our graphics.
728 +
729 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
730 +and worked with Umka on developing libreiser4 and userspace plugins.
731 +
732 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
733 +userspace tools (reiser4progs).
734 +
735 +Oleg Drokin (aka Green) is the release manager who fixes everything.
736 +It is so nice to have someone like that on the team. He (plus Chris
737 +and Jeff) make it possible for the entire rest of the Namesys team to
738 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
739 +is just amazing to watch his talent for spotting bugs in action.
740 +
741 diff -puN /dev/null fs/reiser4/as_ops.c
742 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
743 +++ linux-2.6.16-3-vs/fs/reiser4/as_ops.c 2006-05-30 18:51:50.068560000 +0400
744 @@ -0,0 +1,389 @@
745 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
746 +
747 +/* Interface to VFS. Reiser4 address_space_operations are defined here. */
748 +
749 +#include "forward.h"
750 +#include "debug.h"
751 +#include "dformat.h"
752 +#include "coord.h"
753 +#include "plugin/item/item.h"
754 +#include "plugin/file/file.h"
755 +#include "plugin/security/perm.h"
756 +#include "plugin/disk_format/disk_format.h"
757 +#include "plugin/plugin.h"
758 +#include "plugin/plugin_set.h"
759 +#include "plugin/object.h"
760 +#include "txnmgr.h"
761 +#include "jnode.h"
762 +#include "znode.h"
763 +#include "block_alloc.h"
764 +#include "tree.h"
765 +#include "vfs_ops.h"
766 +#include "inode.h"
767 +#include "page_cache.h"
768 +#include "ktxnmgrd.h"
769 +#include "super.h"
770 +#include "reiser4.h"
771 +#include "entd.h"
772 +
773 +#include <linux/profile.h>
774 +#include <linux/types.h>
775 +#include <linux/mount.h>
776 +#include <linux/vfs.h>
777 +#include <linux/mm.h>
778 +#include <linux/buffer_head.h>
779 +#include <linux/dcache.h>
780 +#include <linux/list.h>
781 +#include <linux/pagemap.h>
782 +#include <linux/slab.h>
783 +#include <linux/seq_file.h>
784 +#include <linux/init.h>
785 +#include <linux/module.h>
786 +#include <linux/writeback.h>
787 +#include <linux/backing-dev.h>
788 +#include <linux/quotaops.h>
789 +#include <linux/security.h>
790 +
791 +/* address space operations */
792 +
793 +/**
794 + * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
795 + * @page: page to be dirtied
796 + *
797 + * Operation of struct address_space_operations. This implementation is used by
798 + * unix and crc file plugins.
799 + *
800 + * This is called when reiser4 page gets dirtied outside of reiser4, for
801 + * example, when dirty bit is moved from pte to physical page.
802 + *
803 + * Tags page in the mapping's page tree with special tag so that it is possible
804 + * to do all the reiser4 specific work wrt dirty pages (jnode creation,
805 + * capturing by an atom) later because it can not be done in the contexts where
806 + * set_page_dirty is called. Always returns 0.
807 + */
808 +int reiser4_set_page_dirty(struct page *page)
809 +{
810 + /* this page can be unformatted only: host is none of the fake inodes */
811 + assert("vs-1734", (page->mapping &&
812 + page->mapping->host &&
813 + get_super_fake(page->mapping->host->i_sb) !=
814 + page->mapping->host
815 + && get_cc_fake(page->mapping->host->i_sb) !=
816 + page->mapping->host
817 + && get_bitmap_fake(page->mapping->host->i_sb) !=
818 + page->mapping->host));
819 +
820 + if (!TestSetPageDirty(page)) {
821 + struct address_space *mapping = page->mapping;
822 +
823 + if (mapping) {
824 + write_lock_irq(&mapping->tree_lock);
825 +
826 + /* check for race with truncate: re-read ->mapping under tree_lock */
827 + if (page->mapping) {
828 + assert("vs-1652", page->mapping == mapping);
829 + if (mapping_cap_account_dirty(mapping))
830 + inc_page_state(nr_dirty);
831 + radix_tree_tag_set(&mapping->page_tree,
832 + page->index,
833 + PAGECACHE_TAG_REISER4_MOVED);
834 + }
835 + write_unlock_irq(&mapping->tree_lock);
836 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
837 + }
838 + }
839 + return 0;
840 +}
841 +
842 +static int filler(void *vp, struct page *page)
843 +{ /* callback given to read_cache_pages(): forwards to mapping's ->readpage */
844 + return page->mapping->a_ops->readpage(vp, page);
845 +}
846 +
847 +/**
848 + * reiser4_readpages - submit read for a set of pages
849 + * @file: file to read
850 + * @mapping: address space
851 + * @pages: list of pages to submit read for
852 + * @nr_pages: number of pages on the list
853 + *
854 + * Operation of struct address_space_operations. This implementation is used by
855 + * unix and crc file plugins.
856 + *
857 + * Calls fsdata->ra2.readpages hook if it is set, read_cache_pages otherwise.
858 + */
859 +int
860 +reiser4_readpages(struct file *file, struct address_space *mapping,
861 + struct list_head *pages, unsigned nr_pages)
862 +{
863 + reiser4_context *ctx;
864 + reiser4_file_fsdata *fsdata;
865 +
866 + ctx = init_context(mapping->host->i_sb);
867 + if (IS_ERR(ctx))
868 + return PTR_ERR(ctx);
869 +
870 + fsdata = reiser4_get_file_fsdata(file);
871 + if (IS_ERR(fsdata)) {
872 + reiser4_exit_context(ctx);
873 + return PTR_ERR(fsdata);
874 + }
875 +
876 + if (fsdata->ra2.readpages)
877 + fsdata->ra2.readpages(mapping, pages, fsdata->ra2.data);
878 + else {
879 + /*
880 + * filler (reiser4 readpage method) may involve tree search
881 + * which is not allowed when lock stack is not clean. If lock
882 + * stack is not clean - read nothing, just drop pages off the list.
883 + */
884 + if (lock_stack_isclean(get_current_lock_stack()))
885 + read_cache_pages(mapping, pages, filler, file);
886 + else {
887 + while (!list_empty(pages)) {
888 + struct page *victim;
889 +
890 + victim = list_entry(pages->prev, struct page, lru);
891 + list_del(&victim->lru);
892 + page_cache_release(victim);
893 + }
894 + }
895 + }
896 + reiser4_exit_context(ctx);
897 + return 0; /* results of the read calls above are not propagated */
898 +}
899 +
900 +/* ->invalidatepage method for reiser4 */
901 +
902 +/*
903 + * this is called for each truncated page from
904 + * truncate_inode_pages()->truncate_{complete,partial}_page().
905 + *
906 + * At the moment of call, page is under lock, and outstanding io (if any) has
907 + * completed.
908 + */
909 +
910 +/**
911 + * reiser4_invalidatepage
912 + * @page: page to invalidate
913 + * @offset: starting offset for partial invalidation
914 + * Returns 0, or error code if reiser4 context could not be initialized.
915 + */
916 +int reiser4_invalidatepage(struct page *page, unsigned long offset)
917 +{
918 + int ret = 0;
919 + reiser4_context *ctx;
920 + struct inode *inode;
921 + jnode *node;
922 +
923 + /*
924 + * This is called to truncate file's page.
925 + *
926 + * Originally, reiser4 implemented truncate in a standard way
927 + * (vmtruncate() calls ->invalidatepage() on all truncated pages
928 + * first, then file system ->truncate() call-back is invoked).
929 + *
930 + * This led to the problem when ->invalidatepage() was called on a
931 + * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
932 + * process. That is, truncate was bypassing transactions. To avoid
933 + * this, try_capture_page_to_invalidate() call was added here.
934 + *
935 + * After many troubles with vmtruncate() based truncate (including
936 + * races with flush, tail conversion, etc.) it was re-written in the
937 + * top-to-bottom style: items are killed in cut_tree_object() and
938 + * pages belonging to extent are invalidated in kill_hook_extent(). So
939 + * probably now additional call to capture is not needed here.
940 + */
941 +
942 + assert("nikita-3137", PageLocked(page));
943 + assert("nikita-3138", !PageWriteback(page));
944 + inode = page->mapping->host;
945 +
946 + /*
947 + * ->invalidatepage() should only be called for the unformatted
948 + * jnodes. Destruction of all other types of jnodes is performed
949 + * separately. But, during some corner cases (like handling errors
950 + * during mount) it is simpler to let ->invalidatepage to be called on
951 + * them. Check for this, and do nothing.
952 + */
953 + if (get_super_fake(inode->i_sb) == inode)
954 + return 0;
955 + if (get_cc_fake(inode->i_sb) == inode)
956 + return 0;
957 + if (get_bitmap_fake(inode->i_sb) == inode)
958 + return 0;
959 + assert("vs-1426", PagePrivate(page));
960 + assert("vs-1427",
961 + page->mapping == jnode_get_mapping(jnode_by_page(page)));
962 + assert("", jprivate(page) != NULL);
963 + assert("", ergo(inode_file_plugin(inode) !=
964 + file_plugin_by_id(CRC_FILE_PLUGIN_ID), offset == 0));
965 +
966 + ctx = init_context(inode->i_sb);
967 + if (IS_ERR(ctx))
968 + return PTR_ERR(ctx);
969 +
970 + node = jprivate(page);
971 + spin_lock_jnode(node);
972 + if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
973 + (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
974 + /* there is no need to capture */
975 + jref(node);
976 + JF_SET(node, JNODE_HEARD_BANSHEE);
977 + page_clear_jnode(page, node);
978 + uncapture_jnode(node); /* NOTE(review): presumably drops jnode lock - confirm */
979 + unhash_unformatted_jnode(node);
980 + jput(node);
981 + reiser4_exit_context(ctx);
982 + return 0;
983 + }
984 + spin_unlock_jnode(node);
985 +
986 + /* capture page being truncated. */
987 + ret = try_capture_page_to_invalidate(page);
988 + if (ret != 0) /* failure is only logged, invalidation goes on */
989 + warning("nikita-3141", "Cannot capture: %i", ret);
990 +
991 + if (offset == 0) {
992 + /* remove jnode from transaction and detach it from page. */
993 + jref(node);
994 + JF_SET(node, JNODE_HEARD_BANSHEE);
995 + /* page cannot be detached from jnode concurrently, because it
996 + * is locked */
997 + uncapture_page(page);
998 +
999 + /* this detaches page from jnode, so that jdelete will not try
1000 + * to lock page which is already locked */
1001 + spin_lock_jnode(node);
1002 + page_clear_jnode(page, node);
1003 + spin_unlock_jnode(node);
1004 + unhash_unformatted_jnode(node);
1005 +
1006 + jput(node);
1007 + }
1008 +
1009 + reiser4_exit_context(ctx);
1010 + return 0;
1011 +}
1012 +
1013 +/* help function called from reiser4_releasepage(). It returns true if jnode
1014 + * can be detached from its page and page released. */
1015 +int jnode_is_releasable(jnode * node /* node to check */ )
1016 +{
1017 + assert("nikita-2781", node != NULL);
1018 + assert_spin_locked(&(node->guard));
1019 + assert_spin_locked(&(node->load));
1020 +
1021 + /* if some thread is currently using jnode page, the latter cannot be
1022 + * detached */
1023 + if (atomic_read(&node->d_count) != 0) {
1024 + return 0;
1025 + }
1026 +
1027 + assert("vs-1214", !jnode_is_loaded(node));
1028 +
1029 + /*
1030 + * can only release page if real block number is assigned to it. Simple
1031 + * check for ->atom wouldn't do, because it is possible for node to be
1032 + * clean, not in atom yet, and still having fake block number. For
1033 + * example, node just created in jinit_new().
1034 + */
1035 + if (blocknr_is_fake(jnode_get_block(node)))
1036 + return 0;
1037 +
1038 + /*
1039 + * pages prepared for write can not be released anyway, so avoid
1040 + * detaching jnode from the page
1041 + */
1042 + if (JF_ISSET(node, JNODE_WRITE_PREPARED))
1043 + return 0;
1044 +
1045 + /*
1046 + * dirty jnode cannot be released. It can however be submitted to disk
1047 + * as part of early flushing, but only after getting flush-prepped.
1048 + */
1049 + if (JF_ISSET(node, JNODE_DIRTY))
1050 + return 0;
1051 +
1052 + /* overwrite set is only written by log writer. */
1053 + if (JF_ISSET(node, JNODE_OVRWR))
1054 + return 0;
1055 +
1056 + /* jnode is already under writeback */
1057 + if (JF_ISSET(node, JNODE_WRITEBACK))
1058 + return 0;
1059 +
1060 + /* only znodes and unformatted jnodes: no bitmaps or journal records */
1061 + if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
1062 + return 0;
1063 +
1064 + return 1;
1065 +}
1066 +
1067 +/*
1068 + * ->releasepage method for reiser4
1069 + *
1070 + * This is called by VM scanner when it comes across clean page. What we have
1071 + * to do here is to check whether page can really be released (freed that is)
1072 + * and if so, detach jnode from it and remove page from the page cache.
1073 + * Returns 1 if jnode was detached from the page, 0 otherwise.
1074 + * Check for releasability is done by jnode_is_releasable() function.
1075 + */
1076 +int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
1077 +{
1078 + jnode *node;
1079 +
1080 + assert("nikita-2257", PagePrivate(page));
1081 + assert("nikita-2259", PageLocked(page));
1082 + assert("nikita-2892", !PageWriteback(page));
1083 + assert("nikita-3019", schedulable());
1084 +
1085 + /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
1086 + is not clear what to do in this case. A lot of deadlocks seems be
1087 + possible. */
1088 +
1089 + node = jnode_by_page(page);
1090 + assert("nikita-2258", node != NULL);
1091 + assert("reiser4-4", page->mapping != NULL);
1092 + assert("reiser4-5", page->mapping->host != NULL);
1093 +
1094 + if (PageDirty(page))
1095 + return 0;
1096 +
1097 + /* jnode_is_releasable() needs jnode lock, because it looks at the jnode
1098 + * fields and we need jload_lock here to avoid races with jload(). */
1099 + spin_lock_jnode(node);
1100 + spin_lock(&(node->load));
1101 + if (jnode_is_releasable(node)) {
1102 + struct address_space *mapping;
1103 +
1104 + mapping = page->mapping; /* NOTE(review): not used below */
1105 + jref(node);
1106 + /* there is no need to synchronize against
1107 + * jnode_extent_write() here, because pages seen by
1108 + * jnode_extent_write() are not releasable. */
1109 + page_clear_jnode(page, node);
1110 + spin_unlock(&(node->load));
1111 + spin_unlock_jnode(node);
1112 +
1113 + /* we are under memory pressure so release jnode also. */
1114 + jput(node);
1115 +
1116 + return 1;
1117 + } else {
1118 + spin_unlock(&(node->load));
1119 + spin_unlock_jnode(node);
1120 + assert("nikita-3020", schedulable());
1121 + return 0;
1122 + }
1123 +}
1124 +
1125 +/* Make Linus happy.
1126 + Local variables:
1127 + c-indentation-style: "K&R"
1128 + mode-name: "LC"
1129 + c-basic-offset: 8
1130 + tab-width: 8
1131 + fill-column: 120
1132 + End:
1133 +*/
1134 diff -puN /dev/null fs/reiser4/block_alloc.c
1135 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
1136 +++ linux-2.6.16-3-vs/fs/reiser4/block_alloc.c 2006-05-30 18:51:50.076560500 +0400
1137 @@ -0,0 +1,1139 @@
1138 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1139 +
1140 +#include "debug.h"
1141 +#include "dformat.h"
1142 +#include "plugin/plugin.h"
1143 +#include "txnmgr.h"
1144 +#include "znode.h"
1145 +#include "block_alloc.h"
1146 +#include "tree.h"
1147 +#include "super.h"
1148 +
1149 +#include <linux/types.h> /* for __u?? */
1150 +#include <linux/fs.h> /* for struct super_block */
1151 +#include <linux/spinlock.h>
1152 +
1153 +/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
1154 +
1155 +/* We need to be able to reserve enough disk space to ensure that an atomic
1156 + operation will have enough disk space to flush (see flush.c and
1157 + http://namesys.com/v4/v4.html) and commit it once it is started.
1158 +
1159 + In our design a call for reserving disk space may fail but not an actual
1160 + block allocation.
1161 +
1162 + All free blocks, already allocated blocks, and all kinds of reserved blocks
1163 + are counted in different per-fs block counters.
1164 +
1165 + A reiser4 super block's set of block counters currently is:
1166 +
1167 + free -- free blocks,
1168 + used -- already allocated blocks,
1169 +
1170 + grabbed -- initially reserved for performing an fs operation, those blocks
1171 + are taken from free blocks, then grabbed disk space leaks from grabbed
1172 + blocks counter to other counters like "fake allocated", "flush
1173 + reserved", "used", the rest of not used grabbed space is returned to
1174 + free space at the end of fs operation;
1175 +
1176 + fake allocated -- counts all nodes without real disk block numbers assigned,
1177 + we have separate accounting for formatted and unformatted
1178 + nodes (for easier debugging);
1179 +
1180 + flush reserved -- disk space needed for flushing and committing an atom.
1181 + Each dirty already allocated block could be written as a
1182 + part of atom's overwrite set or as a part of atom's
1183 + relocate set. In both cases one additional block is needed,
1184 + it is used as a wandered block if we do overwrite or as a
1185 + new location for a relocated block.
1186 +
1187 + In addition, blocks in some states are counted on per-thread and per-atom
1188 + basis. A reiser4 context has a counter of blocks grabbed by this transaction
1189 + and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
1190 + of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
1191 + blocks, which are reserved for flush processing and atom commit. */
1192 +
1193 +/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
1194 + number of blocks to grab for most expensive case of balancing when the leaf
1195 + node we insert new item to gets split and new leaf node is allocated.
1196 +
1197 + So, we need to grab blocks for
1198 +
1199 + 1) one block for possible dirtying the node we insert an item to. That block
1200 + would be used for node relocation at flush time or for allocating of a
1201 + wandered one, it depends what will be a result (what set, relocate or
1202 + overwrite the node gets assigned to) of the node processing by the flush
1203 + algorithm.
1204 +
1205 + 2) one block for either allocating a new node, or dirtying of right or left
1206 + clean neighbor, only one case may happen.
1207 +
1208 + VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
1209 + node, and creation of new node. have I forgotten something? email me.
1210 +
1211 + These grabbed blocks are counted in both reiser4 context "grabbed blocks"
1212 + counter and in the fs-wide one (both ctx->grabbed_blocks and
1213 + sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
1214 + decremented by 2.
1215 +
1216 + Suppose both two blocks were spent for dirtying of an already allocated clean
1217 + node (one block went from "grabbed" to "flush reserved") and for new block
1218 + allocating (one block went from "grabbed" to "fake allocated formatted").
1219 +
1220 + Inserting of a child pointer to the parent node caused parent node to be
1221 + split, the balancing code takes care of this by grabbing necessary space
1222 + immediately by calling reiser4_grab with BA_RESERVED flag set which means
1223 + "can use the 5% reserved disk space".
1224 +
1225 + At this moment insertion completes and grabbed blocks (if they were not used)
1226 + should be returned to the free space counter.
1227 +
1228 + However the atom life-cycle is not completed. The atom had one "flush
1229 + reserved" block added by our insertion and the new fake allocated node is
1230 + counted as a "fake allocated formatted" one. The atom has to be fully
1231 + processed by flush before commit. Suppose that the flush moved the first,
1232 + already allocated node to the atom's overwrite list, the new fake allocated
1233 + node, obviously, went into the atom relocate set. The reiser4 flush
1234 + allocates the new node using one unit from "fake allocated formatted"
1235 + counter, the log writer uses one from "flush reserved" for wandered block
1236 + allocation.
1237 +
1238 + And, it is not the end. When the wandered block is deallocated after the
1239 + atom gets fully played (see wander.c for term description), the disk space
1240 + occupied for it is returned to free blocks. */
1241 +
1242 +/* BLOCK NUMBERS */
1243 +
1244 +/* Any reiser4 node has a block number assigned to it. We use these numbers for
1245 + indexing in hash tables, so if a block has not yet been assigned a location
1246 + on disk we need to give it a temporary fake block number.
1247 +
1248 + Current implementation of reiser4 uses 64-bit integers for block numbers. We
1249 + use highest bit in 64-bit block number to distinguish fake and real block
1250 + numbers. So, only 63 bits may be used for addressing of real device
1251 + blocks. That "fake" block numbers space is divided into subspaces of fake
1252 + block numbers for data blocks and for shadow (working) bitmap blocks.
1253 +
1254 + Fake block numbers for data blocks are generated by a cyclic counter, which
1255 + gets advanced each time fake block numbers are assigned. We assume that it
1256 + is impossible to overflow this counter during one transaction life. */
1257 +
1258 +/* Initialize a blocknr hint: the whole structure is zeroed. */
1259 +void blocknr_hint_init(reiser4_blocknr_hint * hint)
1260 +{
1261 + memset(hint, 0, sizeof(reiser4_blocknr_hint));
1262 +}
1263 +
1264 +/* Release any resources of a blocknr hint. Currently this does nothing. */
1265 +void blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
1266 +{
1267 + /* No resources should be freed in current blocknr_hint implementation. */
1268 +}
1269 +
1270 +/* Returns 1 if *da is a fake block number (explained above), 0 otherwise. */
1271 +/* Audited by: green(2002.06.11) */
1272 +int blocknr_is_fake(const reiser4_block_nr * da)
1273 +{
1274 + /* The reason for not simply returning result of '&' operation is that
1275 + while return value is (possibly 32bit) int, the reiser4_block_nr is
1276 + at least 64 bits long, and high bit (which is the only possible
1277 + non zero bit after the masking) would be stripped off */
1278 + return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
1279 +}
1280 +
1281 +/* Static functions for <reiser4 super block>/<reiser4 context> block counters
1282 + arithmetic. Mostly, they are isolated to avoid coding the same assertions
1283 + in several places. */
1284 +static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
1285 +{
1286 + BUG_ON(ctx->grabbed_blocks < count);
1287 + assert("zam-527", ctx->grabbed_blocks >= count);
1288 + ctx->grabbed_blocks -= count;
1289 +}
1290 +
1291 +static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
1292 +{ /* increase the per-context counter of grabbed blocks */
1293 + ctx->grabbed_blocks += count;
1294 +}
1295 +
1296 +static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
1297 +{ /* decrease fs-wide "grabbed" counter; callers seen here hold super lock */
1298 + assert("zam-525", sbinfo->blocks_grabbed >= count);
1299 + sbinfo->blocks_grabbed -= count;
1300 +}
1301 +
1302 +/* Decrease the counter of blocks reserved for flush in super block. */
1303 +static void
1304 +sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
1305 +{
1306 + assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
1307 + sbinfo->blocks_flush_reserved -= count;
1308 +}
1309 +
1310 +static void
1311 +sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1312 + reiser4_ba_flags_t flags)
1313 +{ /* decrease one of the two "fake allocated" counters of super block */
1314 + if (flags & BA_FORMATTED) { /* counter for formatted nodes */
1315 + assert("zam-806", sbinfo->blocks_fake_allocated >= count);
1316 + sbinfo->blocks_fake_allocated -= count;
1317 + } else { /* counter for unformatted nodes */
1318 + assert("zam-528",
1319 + sbinfo->blocks_fake_allocated_unformatted >= count);
1320 + sbinfo->blocks_fake_allocated_unformatted -= count;
1321 + }
1322 +}
1323 +
1324 +static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
1325 +{ /* "used" counter is asserted not to drop below min_blocks_used */
1326 + assert("zam-530",
1327 + sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
1328 + sbinfo->blocks_used -= count;
1329 +}
1330 +
1331 +static void
1332 +sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
1333 +{ /* decrease the "clustered" blocks counter of super block */
1334 + assert("edward-501", sbinfo->blocks_clustered >= count);
1335 + sbinfo->blocks_clustered -= count;
1336 +}
1337 +
1338 +/* Increase the counter of blocks reserved for flush in atom (atom is locked). */
1339 +static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
1340 +{
1341 + assert("zam-772", atom != NULL);
1342 + assert_spin_locked(&(atom->alock));
1343 + atom->flush_reserved += count;
1344 +}
1345 +
1346 +/* Decrease the counter of blocks reserved for flush in atom (atom is locked). */
1347 +static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
1348 +{
1349 + assert("zam-774", atom != NULL);
1350 + assert_spin_locked(&(atom->alock));
1351 + assert("nikita-2790", atom->flush_reserved >= count);
1352 + atom->flush_reserved -= count;
1353 +}
1354 +
1355 +/* super block has 7 counters: free, used, grabbed, fake allocated (formatted
1356 + and unformatted), flush reserved and clustered. Their sum must be number of
1357 + blocks on a device. Returns 1 if so, otherwise prints counters, returns 0 */
1358 +int check_block_counters(const struct super_block *super)
1359 +{
1360 + __u64 sum;
1361 +
1362 + sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
1363 + reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
1364 + reiser4_fake_allocated_unformatted(super) + flush_reserved(super) +
1365 + reiser4_clustered_blocks(super);
1366 + if (reiser4_block_count(super) != sum) {
1367 + printk("super block counters: "
1368 + "used %llu, free %llu, "
1369 + "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
1370 + "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
1371 + (unsigned long long)reiser4_data_blocks(super),
1372 + (unsigned long long)reiser4_free_blocks(super),
1373 + (unsigned long long)reiser4_grabbed_blocks(super),
1374 + (unsigned long long)reiser4_fake_allocated(super),
1375 + (unsigned long long)
1376 + reiser4_fake_allocated_unformatted(super),
1377 + (unsigned long long)flush_reserved(super),
1378 + (unsigned long long)reiser4_clustered_blocks(super),
1379 + (unsigned long long)sum,
1380 + (unsigned long long)reiser4_block_count(super));
1381 + return 0;
1382 + }
1383 + return 1;
1384 +}
1385 +
1386 +/* Adjust "working" free blocks counter for number of blocks we are going to
1387 + allocate. Record number of grabbed blocks in fs-wide and per-thread
1388 + counters. This function should be called before bitmap scanning or
1389 + allocating fake block numbers
1390 +
1391 + @ctx -- current reiser4 context, its super block gets charged;
1392 + @count -- number of blocks we reserve;
1393 + @flags -- BA_RESERVED allows to take blocks from the reserved area;
1394 + @return -- 0 if success (also on read-only fs), -ENOSPC, if all
1395 + free blocks are reserved or already allocated.
1396 +*/
1397 +
1398 +static int
1399 +reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
1400 +{
1401 + __u64 free_blocks;
1402 + int ret = 0, use_reserved = flags & BA_RESERVED;
1403 + reiser4_super_info_data *sbinfo;
1404 +
1405 + assert("vs-1276", ctx == get_current_context());
1406 +
1407 + /* Do not grab anything on ro-mounted fs, only disable further grabs. */
1408 + if (rofs_super(ctx->super)) {
1409 + ctx->grab_enabled = 0;
1410 + return 0;
1411 + }
1412 +
1413 + sbinfo = get_super_private(ctx->super);
1414 +
1415 + spin_lock_reiser4_super(sbinfo);
1416 +
1417 + free_blocks = sbinfo->blocks_free;
1418 +
1419 + if ((use_reserved && free_blocks < count) ||
1420 + (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
1421 + ret = RETERR(-ENOSPC);
1422 + goto unlock_and_ret;
1423 + }
1424 +
1425 + add_to_ctx_grabbed(ctx, count);
1426 +
1427 + sbinfo->blocks_grabbed += count;
1428 + sbinfo->blocks_free -= count;
1429 +
1430 +#if REISER4_DEBUG
1431 + if (ctx->grabbed_initially == 0)
1432 + ctx->grabbed_initially = count;
1433 +#endif
1434 +
1435 + assert("nikita-2986", check_block_counters(ctx->super));
1436 +
1437 + /* disable grab space in current context */
1438 + ctx->grab_enabled = 0;
1439 +
1440 + unlock_and_ret:
1441 + spin_unlock_reiser4_super(sbinfo);
1442 +
1443 + return ret;
1444 +}
1445 +
1446 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
1447 +{
1448 + int ret;
1449 + reiser4_context *ctx;
1450 +
1451 + assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
1452 + lock_stack_isclean(get_current_lock_stack
1453 + ())));
1454 + ctx = get_current_context();
1455 + if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
1456 + return 0; /* grabbing is disabled in this context and not forced */
1457 + }
1458 +
1459 + ret = reiser4_grab(ctx, count, flags);
1460 + if (ret == -ENOSPC) {
1461 +
1462 + /* Try to commit all transactions if BA_CAN_COMMIT is set, then retry */
1463 + if (flags & BA_CAN_COMMIT) {
1464 + txnmgr_force_commit_all(ctx->super, 0);
1465 + ctx->grab_enabled = 1;
1466 + ret = reiser4_grab(ctx, count, flags);
1467 + }
1468 + }
1469 + /*
1470 + * allocation from reserved pool cannot fail. This is severe error.
1471 + */
1472 + assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
1473 + return ret;
1474 +}
1475 +
1476 +/*
1477 + * SPACE RESERVED FOR UNLINK/TRUNCATE
1478 + *
1479 + * Unlink and truncate require space in transaction (to update stat data, at
1480 + * least). But we don't want rm(1) to fail with "No space on device" error.
1481 + *
1482 + * Solution is to reserve 5% of disk space for truncates and
1483 + * unlinks. Specifically, normal space grabbing requests don't grab space from
1484 + * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
1485 + * drain it. Per super block delete_sema semaphore is used to allow only one
1486 + * thread at a time to grab from reserved area.
1487 + *
1488 + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
1489 + * flag.
1490 + * delete_sema taken here is dropped by reiser4_release_reserved().
1491 + */
1492 +
1493 +int reiser4_grab_reserved(struct super_block *super,
1494 + __u64 count, reiser4_ba_flags_t flags)
1495 +{
1496 + reiser4_super_info_data *sbinfo = get_super_private(super);
1497 +
1498 + assert("nikita-3175", flags & BA_CAN_COMMIT);
1499 +
1500 + /* Check whether the delete semaphore is already taken by us, we assume
1501 + * that reading of machine word is atomic. */
1502 + if (sbinfo->delete_sema_owner == current) {
1503 + if (reiser4_grab_space
1504 + (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
1505 + warning("zam-1003",
1506 + "nested call of grab_reserved fails count=(%llu)",
1507 + (unsigned long long)count);
1508 + reiser4_release_reserved(super);
1509 + return RETERR(-ENOSPC);
1510 + }
1511 + return 0;
1512 + }
1513 +
1514 + if (reiser4_grab_space(count, flags)) { /* ordinary grab failed */
1515 + down(&sbinfo->delete_sema);
1516 + assert("nikita-2929", sbinfo->delete_sema_owner == NULL);
1517 + sbinfo->delete_sema_owner = current;
1518 +
1519 + if (reiser4_grab_space(count, flags | BA_RESERVED)) {
1520 + warning("zam-833",
1521 + "reserved space is not enough (%llu)",
1522 + (unsigned long long)count);
1523 + reiser4_release_reserved(super);
1524 + return RETERR(-ENOSPC);
1525 + }
1526 + }
1527 + return 0;
1528 +}
1529 +
1530 +void reiser4_release_reserved(struct super_block *super)
1531 +{
1532 + reiser4_super_info_data *info;
1533 +
1534 + info = get_super_private(super);
1535 + if (info->delete_sema_owner == current) { /* else there is nothing to do */
1536 + info->delete_sema_owner = NULL;
1537 + up(&info->delete_sema);
1538 + }
1539 +}
1540 +
1541 +static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
1542 +{
1543 + reiser4_context *ctx;
1544 + reiser4_super_info_data *sbinfo;
1545 +
1546 + ctx = get_current_context();
1547 + sub_from_ctx_grabbed(ctx, count);
1548 +
1549 + sbinfo = get_super_private(ctx->super);
1550 + spin_lock_reiser4_super(sbinfo);
1551 +
1552 + sub_from_sb_grabbed(sbinfo, count);
1553 + /* return sbinfo locked: caller has to spin_unlock_reiser4_super() it */
1554 + return sbinfo;
1555 +}
1556 +
1557 +/* is called after fake block number is allocated for a formatted node: one
1558 + block moves from "grabbed" to "fake allocated" (formatted) counter. */
1559 +static void grabbed2fake_allocated_formatted(void)
1560 +{
1561 + reiser4_super_info_data *sbinfo;
1562 +
1563 + sbinfo = grabbed2fake_allocated_head(1);
1564 + sbinfo->blocks_fake_allocated++;
1565 +
1566 + assert("vs-922", check_block_counters(reiser4_get_current_sb()));
1567 +
1568 + spin_unlock_reiser4_super(sbinfo);
1569 +}
1570 +
1571 +/**
1572 + * grabbed2fake_allocated_unformatted - account unformatted fake blocks
1573 + * @count: number of blocks to move between the counters
1574 + * Moves @count blocks from "grabbed" to "fake allocated unformatted".
1575 + */
1576 +static void grabbed2fake_allocated_unformatted(int count)
1577 +{
1578 + reiser4_super_info_data *sbinfo;
1579 +
1580 + sbinfo = grabbed2fake_allocated_head(count);
1581 + sbinfo->blocks_fake_allocated_unformatted += count;
1582 +
1583 + assert("vs-9221", check_block_counters(reiser4_get_current_sb()));
1584 +
1585 + spin_unlock_reiser4_super(sbinfo);
1586 +}
1587 +
1588 +void grabbed2cluster_reserved(int count)
1589 +{ /* move @count blocks from "grabbed" (context and sb) to "clustered" */
1590 + reiser4_context *ctx;
1591 + reiser4_super_info_data *sbinfo;
1592 +
1593 + ctx = get_current_context();
1594 + sub_from_ctx_grabbed(ctx, count);
1595 +
1596 + sbinfo = get_super_private(ctx->super);
1597 + spin_lock_reiser4_super(sbinfo);
1598 +
1599 + sub_from_sb_grabbed(sbinfo, count);
1600 + sbinfo->blocks_clustered += count;
1601 +
1602 + assert("edward-504", check_block_counters(ctx->super));
1603 +
1604 + spin_unlock_reiser4_super(sbinfo);
1605 +}
1606 +
1607 +void cluster_reserved2grabbed(int count)
1608 +{ /* move @count blocks from "clustered" back to "grabbed" (sb and context) */
1609 + reiser4_context *ctx;
1610 + reiser4_super_info_data *sbinfo;
1611 +
1612 + ctx = get_current_context();
1613 +
1614 + sbinfo = get_super_private(ctx->super);
1615 + spin_lock_reiser4_super(sbinfo);
1616 +
1617 + sub_from_cluster_reserved(sbinfo, count);
1618 + sbinfo->blocks_grabbed += count;
1619 +
1620 + assert("edward-505", check_block_counters(ctx->super));
1621 +
1622 + spin_unlock_reiser4_super(sbinfo);
1623 + add_to_ctx_grabbed(ctx, count);
1624 +}
1625 +
1626 +void cluster_reserved2free(int count)
1627 +{ /* return @count "clustered" blocks to the free blocks counter */
1628 + reiser4_context *ctx;
1629 + reiser4_super_info_data *sbinfo;
1630 +
1631 + assert("edward-503", get_current_context()->grabbed_blocks == 0);
1632 +
1633 + ctx = get_current_context();
1634 + sbinfo = get_super_private(ctx->super);
1635 + spin_lock_reiser4_super(sbinfo);
1636 +
1637 + sub_from_cluster_reserved(sbinfo, count);
1638 + sbinfo->blocks_free += count;
1639 +
1640 + assert("edward-502", check_block_counters(ctx->super));
1641 +
1642 + spin_unlock_reiser4_super(sbinfo);
1643 +}
1644 +
1645 +static DEFINE_SPINLOCK(fake_lock); /* protects fake_gen */
1646 +static reiser4_block_nr fake_gen = 0; /* generator of fake block numbers */
1647 +
1648 +/**
1649 + * assign_fake_blocknr
1650 + * @blocknr: where to store the first of the assigned fake block numbers
1651 + * @count: number of consecutive generator values to consume
1652 + *
1653 + * Obtain a fake block number for new node which will be used to refer to
1654 + * this newly allocated node until real allocation is done.
1655 + */
1656 +static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1657 +{
1658 + spin_lock(&fake_lock);
1659 + *blocknr = fake_gen;
1660 + fake_gen += count;
1661 + spin_unlock(&fake_lock);
1662 +
1663 + BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1664 + /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1665 + *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1666 + assert("zam-394", zlook(current_tree, blocknr) == NULL);
1667 +}
1668 +
1669 +int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1670 +{ /* get one fake block number and account it as "fake allocated" */
1671 + assign_fake_blocknr(blocknr, 1);
1672 + grabbed2fake_allocated_formatted();
1673 + return 0; /* the only return value */
1674 +}
1675 +
1676 +/**
1677 + * fake_blocknr_unformatted - get fake block numbers for unformatted nodes
1678 + * @count: number of fake numbers to get
1679 + *
1680 + * Allocates @count fake block numbers for jnodes, returns the first of them
1681 + */
1682 +reiser4_block_nr fake_blocknr_unformatted(int count)
1683 +{
1684 + reiser4_block_nr blocknr;
1685 +
1686 + assign_fake_blocknr(&blocknr, count);
1687 + grabbed2fake_allocated_unformatted(count);
1688 +
1689 + return blocknr;
1690 +}
1691 +
1692 +/* adjust sb block counters, if real (on-disk) block allocation immediately
1693 + follows grabbing of free disk space: @count blocks go "grabbed" -> "used" */
1694 +void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1695 + __u64 count)
1696 +{
1697 + sub_from_ctx_grabbed(ctx, count);
1698 +
1699 + spin_lock_reiser4_super(sbinfo);
1700 +
1701 + sub_from_sb_grabbed(sbinfo, count);
1702 + sbinfo->blocks_used += count;
1703 +
1704 + assert("nikita-2679", check_block_counters(ctx->super));
1705 +
1706 + spin_unlock_reiser4_super(sbinfo);
1707 +}
1708 +
1709 +/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1710 +void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1711 + reiser4_ba_flags_t flags)
1712 +{ /* BA_FORMATTED in @flags selects which "fake allocated" counter to charge */
1713 + spin_lock_reiser4_super(sbinfo);
1714 +
1715 + sub_from_sb_fake_allocated(sbinfo, count, flags);
1716 + sbinfo->blocks_used += count;
1717 +
1718 + assert("nikita-2680", check_block_counters(reiser4_get_current_sb()));
1719 +
1720 + spin_unlock_reiser4_super(sbinfo);
1721 +}
1722 +
1723 +void flush_reserved2used(txn_atom * atom, __u64 count)
1724 +{ /* move @count blocks from "flush reserved" (atom and sb) to "used" */
1725 + reiser4_super_info_data *sbinfo;
1726 +
1727 + assert("zam-787", atom != NULL);
1728 + assert_spin_locked(&(atom->alock));
1729 +
1730 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count); /* NOTE(review): 64-bit count is cut to 32 bits */
1731 +
1732 + sbinfo = get_current_super_private();
1733 + spin_lock_reiser4_super(sbinfo);
1734 +
1735 + sub_from_sb_flush_reserved(sbinfo, count);
1736 + sbinfo->blocks_used += count;
1737 +
1738 + assert("zam-789", check_block_counters(reiser4_get_current_sb()));
1739 +
1740 + spin_unlock_reiser4_super(sbinfo);
1741 +}
1742 +
1743 +/* update the per fs blocknr hint default value. */
1744 +void
1745 +update_blocknr_hint_default(const struct super_block *s,
1746 + const reiser4_block_nr * block)
1747 +{
1748 + reiser4_super_info_data *sbinfo = get_super_private(s);
1749 +
1750 + assert("nikita-3342", !blocknr_is_fake(block));
1751 +
1752 + spin_lock_reiser4_super(sbinfo);
1753 + if (*block < sbinfo->block_count) {
1754 + sbinfo->blocknr_hint_default = *block;
1755 + } else {
1756 + warning("zam-676",
1757 + "block number %llu is too large to be used in a blocknr hint\n",
1758 + (unsigned long long)*block);
1759 + dump_stack();
1760 + DEBUGON(1);
1761 + }
1762 + spin_unlock_reiser4_super(sbinfo);
1763 +}
1764 +
1765 +/* get current value of the default blocknr hint. */
1766 +void get_blocknr_hint_default(reiser4_block_nr * result)
1767 +{
1768 + reiser4_super_info_data *sbinfo = get_current_super_private();
1769 +
1770 + spin_lock_reiser4_super(sbinfo);
1771 + *result = sbinfo->blocknr_hint_default;
1772 + assert("zam-677", *result < sbinfo->block_count);
1773 + spin_unlock_reiser4_super(sbinfo);
1774 +}
1775 +
1776 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
1777 + * method. Blocks are allocated in one contiguous disk region. The plugin
1778 + * independent part accounts for the blocks by subtracting the allocated amount
1779 + * from the grabbed, fake-allocated or flush-reserved counter and adding the
1780 + * same amount to the counter of used blocks.
1781 + *
1782 + * @hint -- a reiser4 blocknr hint object which contains further block
1783 + * allocation hints and parameters (search start, a stage of block
1784 + * which will be mapped to disk, etc.),
1785 + * @blk -- an out parameter for the beginning of the allocated region,
1786 + * @len -- in/out parameter, it should contain the maximum number of allocated
1787 + * blocks, after block allocation completes, it contains the length of
1788 + * allocated disk region.
1789 + * @flags -- see reiser4_ba_flags_t description.
1790 + *
1791 + * @return -- 0 if success, error code otherwise.
1792 + */
1793 +int
1794 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1795 + reiser4_block_nr * len, reiser4_ba_flags_t flags)
1796 +{
1797 + __u64 needed = *len;
1798 + reiser4_context *ctx;
1799 + reiser4_super_info_data *sbinfo;
1800 + int ret;
1801 +
1802 + assert("zam-986", hint != NULL);
1803 +
1804 + ctx = get_current_context();
1805 + sbinfo = get_super_private(ctx->super);
1806 +
1807 + /* For write-optimized data we use default search start value, which is
1808 + * close to last write location. */
1809 + if (flags & BA_USE_DEFAULT_SEARCH_START) {
1810 + get_blocknr_hint_default(&hint->blk);
1811 + }
1812 +
1813 + /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1814 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1815 + if (hint->block_stage == BLOCK_NOT_COUNTED) {
1816 + ret = reiser4_grab_space_force(*len, flags);
1817 + if (ret != 0)
1818 + return ret;
1819 + }
1820 +
1821 + ret =
1822 + sa_alloc_blocks(get_space_allocator(ctx->super), hint, (int)needed,
1823 + blk, len);
1824 +
1825 + if (!ret) {
1826 + assert("zam-680", *blk < reiser4_block_count(ctx->super));
1827 + assert("zam-681",
1828 + *blk + *len <= reiser4_block_count(ctx->super));
1829 +
1830 + if (flags & BA_PERMANENT) {
1831 + /* we assume that current atom exists at this moment */
1832 + txn_atom *atom = get_current_atom_locked();
1833 + atom->nr_blocks_allocated += *len;
1834 + spin_unlock_atom(atom);
1835 + }
1836 +
1837 + switch (hint->block_stage) {
1838 + case BLOCK_NOT_COUNTED:
1839 + case BLOCK_GRABBED:
1840 + grabbed2used(ctx, sbinfo, *len);
1841 + break;
1842 + case BLOCK_UNALLOCATED:
1843 + fake_allocated2used(sbinfo, *len, flags);
1844 + break;
1845 + case BLOCK_FLUSH_RESERVED:
1846 + {
1847 + txn_atom *atom = get_current_atom_locked();
1848 + flush_reserved2used(atom, *len);
1849 + spin_unlock_atom(atom);
1850 + }
1851 + break;
1852 + default:
1853 + impossible("zam-531", "wrong block stage");
1854 + }
1855 + } else {
1856 + assert("zam-821",
1857 + ergo(hint->max_dist == 0
1858 + && !hint->backward, ret != -ENOSPC));
1859 + if (hint->block_stage == BLOCK_NOT_COUNTED)
1860 + grabbed2free(ctx, sbinfo, needed);
1861 + }
1862 +
1863 + return ret;
1864 +}
1865 +
1866 +/* used -> fake_allocated -> grabbed -> free */
1867 +
1868 +/* adjust sb block counters when @count unallocated blocks get unmapped from
1869 + disk */
1870 +static void
1871 +used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1872 + int formatted)
1873 +{
1874 + spin_lock_reiser4_super(sbinfo);
1875 +
1876 + if (formatted)
1877 + sbinfo->blocks_fake_allocated += count;
1878 + else
1879 + sbinfo->blocks_fake_allocated_unformatted += count;
1880 +
1881 + sub_from_sb_used(sbinfo, count);
1882 +
1883 + assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
1884 +
1885 + spin_unlock_reiser4_super(sbinfo);
1886 +}
1887 +
1888 +static void
1889 +used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1890 + __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1891 +{
1892 + assert("nikita-2791", atom != NULL);
1893 + assert_spin_locked(&(atom->alock));
1894 +
1895 + add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1896 +
1897 + spin_lock_reiser4_super(sbinfo);
1898 +
1899 + sbinfo->blocks_flush_reserved += count;
1900 + /*add_to_sb_flush_reserved(sbinfo, count); */
1901 + sub_from_sb_used(sbinfo, count);
1902 +
1903 + assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
1904 +
1905 + spin_unlock_reiser4_super(sbinfo);
1906 +}
1907 +
1908 +/* disk space, virtually used by fake block numbers is counted as "grabbed" again. */
1909 +static void
1910 +fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1911 + __u64 count, reiser4_ba_flags_t flags)
1912 +{
1913 + add_to_ctx_grabbed(ctx, count);
1914 +
1915 + spin_lock_reiser4_super(sbinfo);
1916 +
1917 + assert("nikita-2682", check_block_counters(ctx->super));
1918 +
1919 + sbinfo->blocks_grabbed += count;
1920 + sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1921 +
1922 + assert("nikita-2683", check_block_counters(ctx->super));
1923 +
1924 + spin_unlock_reiser4_super(sbinfo);
1925 +}
1926 +
1927 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1928 +{
1929 + reiser4_context *ctx;
1930 + reiser4_super_info_data *sbinfo;
1931 +
1932 + ctx = get_current_context();
1933 + sbinfo = get_super_private(ctx->super);
1934 +
1935 + fake_allocated2grabbed(ctx, sbinfo, count, flags);
1936 + grabbed2free(ctx, sbinfo, count);
1937 +}
1938 +
1939 +void grabbed2free_mark(__u64 mark)
1940 +{
1941 + reiser4_context *ctx;
1942 + reiser4_super_info_data *sbinfo;
1943 +
1944 + ctx = get_current_context();
1945 + sbinfo = get_super_private(ctx->super);
1946 +
1947 + assert("nikita-3007", (__s64) mark >= 0);
1948 + assert("nikita-3006", ctx->grabbed_blocks >= mark);
1949 + grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1950 +}
1951 +
1952 +/**
1953 + * grabbed2free - adjust grabbed and free block counters
1954 + * @ctx: context to update grabbed block counter of
1955 + * @sbinfo: super block to update grabbed and free block counters of
1956 + * @count: number of blocks to adjust counters by
1957 + *
1958 + * Decreases context's and per filesystem's counters of grabbed
1959 + * blocks. Increases per filesystem's counter of free blocks.
1960 + */
1961 +void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1962 + __u64 count)
1963 +{
1964 + sub_from_ctx_grabbed(ctx, count);
1965 +
1966 + spin_lock_reiser4_super(sbinfo);
1967 +
1968 + sub_from_sb_grabbed(sbinfo, count);
1969 + sbinfo->blocks_free += count;
1970 + assert("nikita-2684", check_block_counters(ctx->super));
1971 +
1972 + spin_unlock_reiser4_super(sbinfo);
1973 +}
1974 +
1975 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1976 +{
1977 + reiser4_context *ctx;
1978 + reiser4_super_info_data *sbinfo;
1979 +
1980 + assert("vs-1095", atom);
1981 +
1982 + ctx = get_current_context();
1983 + sbinfo = get_super_private(ctx->super);
1984 +
1985 + sub_from_ctx_grabbed(ctx, count);
1986 +
1987 + add_to_atom_flush_reserved_nolock(atom, count);
1988 +
1989 + spin_lock_reiser4_super(sbinfo);
1990 +
1991 + sbinfo->blocks_flush_reserved += count;
1992 + sub_from_sb_grabbed(sbinfo, count);
1993 +
1994 + assert("vpf-292", check_block_counters(ctx->super));
1995 +
1996 + spin_unlock_reiser4_super(sbinfo);
1997 +}
1998 +
1999 +void grabbed2flush_reserved(__u64 count)
2000 +{
2001 + txn_atom *atom = get_current_atom_locked();
2002 +
2003 + grabbed2flush_reserved_nolock(atom, count);
2004 +
2005 + spin_unlock_atom(atom);
2006 +}
2007 +
2008 +void flush_reserved2grabbed(txn_atom * atom, __u64 count)
2009 +{
2010 + reiser4_context *ctx;
2011 + reiser4_super_info_data *sbinfo;
2012 +
2013 + assert("nikita-2788", atom != NULL);
2014 + assert_spin_locked(&(atom->alock));
2015 +
2016 + ctx = get_current_context();
2017 + sbinfo = get_super_private(ctx->super);
2018 +
2019 + add_to_ctx_grabbed(ctx, count);
2020 +
2021 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
2022 +
2023 + spin_lock_reiser4_super(sbinfo);
2024 +
2025 + sbinfo->blocks_grabbed += count;
2026 + sub_from_sb_flush_reserved(sbinfo, count);
2027 +
2028 + assert("vpf-292", check_block_counters(ctx->super));
2029 +
2030 + spin_unlock_reiser4_super(sbinfo);
2031 +}
2032 +
2033 +/**
2034 + * all_grabbed2free - releases all blocks grabbed in context
2035 + *
2036 + * Decreases context's and super block's grabbed block counters by number of
2037 + * blocks grabbed by current context and increases super block's free block
2038 + * counter correspondingly.
2039 + */
2040 +void all_grabbed2free(void)
2041 +{
2042 + reiser4_context *ctx = get_current_context();
2043 +
2044 + grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
2045 +}
2046 +
2047 +/* adjust sb block counters if real (on-disk) blocks do not become unallocated
2048 + after freeing, @count blocks become "grabbed". */
2049 +static void
2050 +used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
2051 + __u64 count)
2052 +{
2053 + add_to_ctx_grabbed(ctx, count);
2054 +
2055 + spin_lock_reiser4_super(sbinfo);
2056 +
2057 + sbinfo->blocks_grabbed += count;
2058 + sub_from_sb_used(sbinfo, count);
2059 +
2060 + assert("nikita-2685", check_block_counters(ctx->super));
2061 +
2062 + spin_unlock_reiser4_super(sbinfo);
2063 +}
2064 +
2065 +/* this used to be done through used2grabbed and grabbed2free*/
2066 +static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
2067 +{
2068 + spin_lock_reiser4_super(sbinfo);
2069 +
2070 + sbinfo->blocks_free += count;
2071 + sub_from_sb_used(sbinfo, count);
2072 +
2073 + assert("nikita-2685", check_block_counters(reiser4_get_current_sb()));
2074 +
2075 + spin_unlock_reiser4_super(sbinfo);
2076 +}
2077 +
2078 +#if REISER4_DEBUG
2079 +
2080 +/* check "allocated" state of given block range */
2081 +static void
2082 +reiser4_check_blocks(const reiser4_block_nr * start,
2083 + const reiser4_block_nr * len, int desired)
2084 +{
2085 + sa_check_blocks(start, len, desired);
2086 +}
2087 +
2088 +/* check "allocated" state of given block */
2089 +void reiser4_check_block(const reiser4_block_nr * block, int desired)
2090 +{
2091 + const reiser4_block_nr one = 1;
2092 +
2093 + reiser4_check_blocks(block, &one, desired);
2094 +}
2095 +
2096 +#endif
2097 +
2098 +/* The block deallocation function either performs the actual deallocation
2099 + through the space allocator plugin or stores the deleted block numbers in
2100 + the atom's delete_set, depending on whether BA_DEFER is set in @flags. */
2101 +
2102 +/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
2103 + will be deleted from WORKING bitmap. They might be just unmapped from disk, or
2104 + freed but disk space is still grabbed by current thread, or these blocks must
2105 + not be counted in any reiser4 sb block counters, see block_stage_t comment */
2106 +
2107 +/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
2108 + distinguish blocks allocated for unformatted and formatted nodes */
2109 +
2110 +int
2111 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
2112 + const reiser4_block_nr * len,
2113 + block_stage_t target_stage, reiser4_ba_flags_t flags)
2114 +{
2115 + txn_atom *atom = NULL;
2116 + int ret;
2117 + reiser4_context *ctx;
2118 + reiser4_super_info_data *sbinfo;
2119 +
2120 + ctx = get_current_context();
2121 + sbinfo = get_super_private(ctx->super);
2122 +
2123 + if (REISER4_DEBUG) {
2124 + assert("zam-431", *len != 0);
2125 + assert("zam-432", *start != 0);
2126 + assert("zam-558", !blocknr_is_fake(start));
2127 +
2128 + spin_lock_reiser4_super(sbinfo);
2129 + assert("zam-562", *start < sbinfo->block_count);
2130 + spin_unlock_reiser4_super(sbinfo);
2131 + }
2132 +
2133 + if (flags & BA_DEFER) {
2134 + blocknr_set_entry *bsep = NULL;
2135 +
2136 + /* storing deleted block numbers in a blocknr set
2137 + datastructure for further actual deletion */
2138 + do {
2139 + atom = get_current_atom_locked();
2140 + assert("zam-430", atom != NULL);
2141 +
2142 + ret =
2143 + blocknr_set_add_extent(atom, &atom->delete_set,
2144 + &bsep, start, len);
2145 +
2146 + if (ret == -ENOMEM)
2147 + return ret;
2148 +
2149 + /* This loop might spin at most two times */
2150 + } while (ret == -E_REPEAT);
2151 +
2152 + assert("zam-477", ret == 0);
2153 + assert("zam-433", atom != NULL);
2154 +
2155 + spin_unlock_atom(atom);
2156 +
2157 + } else {
2158 + assert("zam-425", get_current_super_private() != NULL);
2159 + sa_dealloc_blocks(get_space_allocator(ctx->super), *start,
2160 + *len);
2161 +
2162 + if (flags & BA_PERMANENT) {
2163 + /* These blocks were counted as allocated, we have to revert it
2164 + * back if allocation is discarded. */
2165 + txn_atom *atom = get_current_atom_locked();
2166 + atom->nr_blocks_allocated -= *len;
2167 + spin_unlock_atom(atom);
2168 + }
2169 +
2170 + switch (target_stage) {
2171 + case BLOCK_NOT_COUNTED:
2172 + assert("vs-960", flags & BA_FORMATTED);
2173 + /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
2174 + used2free(sbinfo, *len);
2175 + break;
2176 +
2177 + case BLOCK_GRABBED:
2178 + used2grabbed(ctx, sbinfo, *len);
2179 + break;
2180 +
2181 + case BLOCK_UNALLOCATED:
2182 + used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
2183 + break;
2184 +
2185 + case BLOCK_FLUSH_RESERVED:{
2186 + txn_atom *atom;
2187 +
2188 + atom = get_current_atom_locked();
2189 + used2flush_reserved(sbinfo, atom, *len,
2190 + flags & BA_FORMATTED);
2191 + spin_unlock_atom(atom);
2192 + break;
2193 + }
2194 + default:
2195 + impossible("zam-532", "wrong block stage");
2196 + }
2197 + }
2198 +
2199 + return 0;
2200 +}
2201 +
2202 +/* wrappers for block allocator plugin methods */
2203 +int pre_commit_hook(void)
2204 +{
2205 + assert("zam-502", get_current_super_private() != NULL);
2206 + sa_pre_commit_hook();
2207 + return 0;
2208 +}
2209 +
2210 +/* an actor which applies delete set to block allocator data */
2211 +static int
2212 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
2213 + const reiser4_block_nr * b, void *data UNUSED_ARG)
2214 +{
2215 + reiser4_context *ctx;
2216 + reiser4_super_info_data *sbinfo;
2217 +
2218 + __u64 len = 1;
2219 +
2220 + ctx = get_current_context();
2221 + sbinfo = get_super_private(ctx->super);
2222 +
2223 + assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
2224 + assert("zam-552", sbinfo != NULL);
2225 +
2226 + if (b != NULL)
2227 + len = *b;
2228 +
2229 + if (REISER4_DEBUG) {
2230 + spin_lock_reiser4_super(sbinfo);
2231 +
2232 + assert("zam-554", *a < reiser4_block_count(ctx->super));
2233 + assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
2234 +
2235 + spin_unlock_reiser4_super(sbinfo);
2236 + }
2237 +
2238 + sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
2239 + /* adjust sb block counters */
2240 + used2free(sbinfo, len);
2241 + return 0;
2242 +}
2243 +
2244 +void post_commit_hook(void)
2245 +{
2246 + txn_atom *atom;
2247 +
2248 + atom = get_current_atom_locked();
2249 + assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
2250 + spin_unlock_atom(atom);
2251 +
2252 + /* do the block deallocation which was deferred
2253 + until commit is done */
2254 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
2255 +
2256 + assert("zam-504", get_current_super_private() != NULL);
2257 + sa_post_commit_hook();
2258 +}
2259 +
2260 +void post_write_back_hook(void)
2261 +{
2262 + assert("zam-504", get_current_super_private() != NULL);
2263 +
2264 + sa_post_commit_hook();
2265 +}
2266 +
2267 +/*
2268 + Local variables:
2269 + c-indentation-style: "K&R"
2270 + mode-name: "LC"
2271 + c-basic-offset: 8
2272 + tab-width: 8
2273 + fill-column: 120
2274 + scroll-step: 1
2275 + End:
2276 +*/
2277 diff -puN /dev/null fs/reiser4/block_alloc.h
2278 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
2279 +++ linux-2.6.16-3-vs/fs/reiser4/block_alloc.h 2006-05-30 18:51:50.076560500 +0400
2280 @@ -0,0 +1,175 @@
2281 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2282 +
2283 +#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
2284 +#define __FS_REISER4_BLOCK_ALLOC_H__
2285 +
2286 +#include "dformat.h"
2287 +#include "forward.h"
2288 +
2289 +#include <linux/types.h> /* for __u?? */
2290 +#include <linux/fs.h>
2291 +
2292 +/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
2293 +#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
2294 +/* Mask which isolates a type of object this fake block number was assigned to */
2295 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
2296 +
2297 +/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
2298 + against these two values to tell whether the object is unallocated or is a
2299 + bitmap shadow object (WORKING BITMAP block, see plugin/space/bitmap.c) */
2300 +#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
2301 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
2302 +
2303 +/* specification how block allocation was counted in sb block counters */
2304 +typedef enum {
2305 + BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
2306 + BLOCK_GRABBED = 1, /* free space grabbed for further allocation
2307 + of this block */
2308 + BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
2309 + BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
2310 + ( unallocated formatted or unformatted
2311 + node) */
2312 + BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
2313 + number assigned */
2314 +} block_stage_t;
2315 +
2316 +/* a hint for block allocator */
2317 +struct reiser4_blocknr_hint {
2318 + /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
2319 + is to prevent jnode_flush() calls from interleaving allocations on the same
2320 + bitmap, once a hint is established. */
2321 +
2322 + /* search start hint */
2323 + reiser4_block_nr blk;
2324 + /* if not zero, it is a region size we search for free blocks in */
2325 + reiser4_block_nr max_dist;
2326 + /* level for allocation, may be useful to have branch-level and higher
2327 + write-optimized. */
2328 + tree_level level;
2329 + /* block allocator assumes that blocks, which will be mapped to disk,
2330 + are in this specified block_stage */
2331 + block_stage_t block_stage;
2332 + /* If backward = 1, allocate blocks in the backward direction, from the
2333 + * end of disk to the beginning of disk. */
2334 + unsigned int backward:1;
2335 +
2336 +};
2337 +
2338 +/* These flags control block allocation/deallocation behavior */
2339 +enum reiser4_ba_flags {
2340 + /* do allocations from the reserved (5%) area */
2341 + BA_RESERVED = (1 << 0),
2342 +
2343 + /* block allocator can do commit trying to recover free space */
2344 + BA_CAN_COMMIT = (1 << 1),
2345 +
2346 + /* if operation will be applied to formatted block */
2347 + BA_FORMATTED = (1 << 2),
2348 +
2349 + /* defer actual block freeing until transaction commit */
2350 + BA_DEFER = (1 << 3),
2351 +
2352 + /* allocate blocks for permanent fs objects (formatted or unformatted), not
2353 + wandered or log blocks */
2354 + BA_PERMANENT = (1 << 4),
2355 +
2356 + /* grab space even if it was disabled */
2357 + BA_FORCE = (1 << 5),
2358 +
2359 + /* use default start value for free blocks search. */
2360 + BA_USE_DEFAULT_SEARCH_START = (1 << 6)
2361 +};
2362 +
2363 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
2364 +
2365 +extern void blocknr_hint_init(reiser4_blocknr_hint * hint);
2366 +extern void blocknr_hint_done(reiser4_blocknr_hint * hint);
2367 +extern void update_blocknr_hint_default(const struct super_block *,
2368 + const reiser4_block_nr *);
2369 +extern void get_blocknr_hint_default(reiser4_block_nr *);
2370 +
2371 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
2372 +
2373 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
2374 +reiser4_block_nr fake_blocknr_unformatted(int);
2375 +
2376 +/* free -> grabbed -> fake_allocated -> used */
2377 +
2378 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
2379 +void all_grabbed2free(void);
2380 +void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
2381 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
2382 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
2383 +void grabbed2flush_reserved(__u64 count);
2384 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
2385 + reiser4_block_nr * start,
2386 + reiser4_block_nr * len, reiser4_ba_flags_t flags);
2387 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
2388 + const reiser4_block_nr *,
2389 + block_stage_t, reiser4_ba_flags_t flags);
2390 +
2391 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
2392 + reiser4_block_nr * start,
2393 + reiser4_ba_flags_t flags)
2394 +{
2395 + reiser4_block_nr one = 1;
2396 + return reiser4_alloc_blocks(hint, start, &one, flags);
2397 +}
2398 +
2399 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
2400 + block_stage_t stage,
2401 + reiser4_ba_flags_t flags)
2402 +{
2403 + const reiser4_block_nr one = 1;
2404 + return reiser4_dealloc_blocks(block, &one, stage, flags);
2405 +}
2406 +
2407 +#define reiser4_grab_space_force(count, flags) \
2408 + reiser4_grab_space(count, flags | BA_FORCE)
2409 +
2410 +extern void grabbed2free_mark(__u64 mark);
2411 +extern int reiser4_grab_reserved(struct super_block *,
2412 + __u64, reiser4_ba_flags_t);
2413 +extern void reiser4_release_reserved(struct super_block *super);
2414 +
2415 +/* grabbed -> fake_allocated */
2416 +
2417 +/* fake_allocated -> used */
2418 +
2419 +/* used -> fake_allocated -> grabbed -> free */
2420 +
2421 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
2422 +
2423 +extern int blocknr_is_fake(const reiser4_block_nr * da);
2424 +
2425 +extern void grabbed2cluster_reserved(int count);
2426 +extern void cluster_reserved2grabbed(int count);
2427 +extern void cluster_reserved2free(int count);
2428 +
2429 +extern int check_block_counters(const struct super_block *);
2430 +
2431 +#if REISER4_DEBUG
2432 +
2433 +extern void reiser4_check_block(const reiser4_block_nr *, int);
2434 +
2435 +#else
2436 +
2437 +# define reiser4_check_block(beg, val) noop
2438 +
2439 +#endif
2440 +
2441 +extern int pre_commit_hook(void);
2442 +extern void post_commit_hook(void);
2443 +extern void post_write_back_hook(void);
2444 +
2445 +#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
2446 +
2447 +/* Make Linus happy.
2448 + Local variables:
2449 + c-indentation-style: "K&R"
2450 + mode-name: "LC"
2451 + c-basic-offset: 8
2452 + tab-width: 8
2453 + fill-column: 120
2454 + End:
2455 +*/
2456 diff -puN /dev/null fs/reiser4/blocknrset.c
2457 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
2458 +++ linux-2.6.16-3-vs/fs/reiser4/blocknrset.c 2006-05-30 18:51:50.080560750 +0400
2459 @@ -0,0 +1,368 @@
2460 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2461 +
2462 +/* This file contains code for various block number sets used by the atom to
2463 + track the deleted set and wandered block mappings. */
2464 +
2465 +#include "debug.h"
2466 +#include "dformat.h"
2467 +#include "txnmgr.h"
2468 +#include "context.h"
2469 +
2470 +#include <linux/slab.h>
2471 +
2472 +/* The proposed data structure for storing unordered block number sets is a
2473 + list of elements, each of which contains an array of block numbers and/or
2474 + an array of block number pairs. Each element, called blocknr_set_entry,
2475 + stores single block numbers from the beginning and pairs (extents) from the
2476 + end of its ->entries array. The ->nr_singles and ->nr_pairs fields count
2477 + the stored single blocks and pairs.
2478 +
2479 + +------------------- blocknr_set_entry->data ------------------+
2480 + |block1|block2| ... <free space> ... |pair3|pair2|pair1|
2481 + +------------------------------------------------------------+
2482 +
2483 + When current blocknr_set_entry is full, allocate a new one. */
2484 +
2485 +/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
2486 + * set (single blocks and block extents), in that case blocknr pair represent an
2487 + * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
2488 + * there represent a (real block) -> (wandered block) mapping. */
2489 +
2490 +typedef struct blocknr_pair blocknr_pair;
2491 +
2492 +/* The total size of a blocknr_set_entry. */
2493 +#define BLOCKNR_SET_ENTRY_SIZE 128
2494 +
2495 +/* The number of blocks that can fit the blocknr data area. */
2496 +#define BLOCKNR_SET_ENTRIES_NUMBER \
2497 + ((BLOCKNR_SET_ENTRY_SIZE - \
2498 + 2 * sizeof (unsigned) - \
2499 + sizeof(struct list_head)) / \
2500 + sizeof(reiser4_block_nr))
2501 +
2502 +/* An entry of the blocknr_set */
2503 +struct blocknr_set_entry {
2504 + unsigned nr_singles;
2505 + unsigned nr_pairs;
2506 + struct list_head link;
2507 + reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
2508 +};
2509 +
2510 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
2511 +struct blocknr_pair {
2512 + reiser4_block_nr a;
2513 + reiser4_block_nr b;
2514 +};
2515 +
2516 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
2517 +/* Audited by: green(2002.06.11) */
2518 +static unsigned bse_avail(blocknr_set_entry * bse)
2519 +{
2520 + unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
2521 +
2522 + assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
2523 + cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
2524 +
2525 + return BLOCKNR_SET_ENTRIES_NUMBER - used;
2526 +}
2527 +
2528 +/* Initialize a blocknr_set_entry. */
2529 +static void bse_init(blocknr_set_entry *bse)
2530 +{
2531 + bse->nr_singles = 0;
2532 + bse->nr_pairs = 0;
2533 + INIT_LIST_HEAD(&bse->link);
2534 +}
2535 +
2536 +/* Allocate and initialize a blocknr_set_entry. */
2537 +/* Audited by: green(2002.06.11) */
2538 +static blocknr_set_entry *bse_alloc(void)
2539 +{
2540 + blocknr_set_entry *e;
2541 +
2542 + if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
2543 + get_gfp_mask())) == NULL)
2544 + return NULL;
2545 +
2546 + bse_init(e);
2547 +
2548 + return e;
2549 +}
2550 +
2551 +/* Free a blocknr_set_entry. */
2552 +/* Audited by: green(2002.06.11) */
2553 +static void bse_free(blocknr_set_entry * bse)
2554 +{
2555 + kfree(bse);
2556 +}
2557 +
2558 +/* Add a block number to a blocknr_set_entry */
2559 +/* Audited by: green(2002.06.11) */
2560 +static void
2561 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2562 +{
2563 + assert("jmacd-5099", bse_avail(bse) >= 1);
2564 +
2565 + bse->entries[bse->nr_singles++] = *block;
2566 +}
2567 +
2568 +/* Get a pair of block numbers */
2569 +/* Audited by: green(2002.06.11) */
2570 +static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno)
2571 +{
2572 + assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2573 +
2574 + return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER -
2575 + 2 * (pno + 1));
2576 +}
2577 +
2578 +/* Add a pair of block numbers to a blocknr_set_entry */
2579 +/* Audited by: green(2002.06.11) */
2580 +static void
2581 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2582 + const reiser4_block_nr * b)
2583 +{
2584 + blocknr_pair *pair;
2585 +
2586 + assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2587 +
2588 + pair = bse_get_pair(bse, bse->nr_pairs++);
2589 +
2590 + pair->a = *a;
2591 + pair->b = *b;
2592 +}
2593 +
2594 +/* Add either a block or pair of blocks to the block number set. The first
2595 + blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2596 + @b is non-NULL a pair is added. The block number set belongs to atom, and
2597 + the call is made with the atom lock held. There may not be enough space in
2598 + the current blocknr_set_entry. If new_bsep points to a non-NULL
2599 + blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2600 + will be set to NULL. If new_bsep contains NULL then the atom lock will be
2601 + released and a new bse will be allocated in new_bsep. E_REPEAT will be
2602 + returned with the atom unlocked for the operation to be tried again. If
2603 + the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2604 + used during the call, it will be freed automatically. */
2605 +static int blocknr_set_add(txn_atom *atom, blocknr_set *bset,
2606 + blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2607 + const reiser4_block_nr *b)
2608 +{
2609 + blocknr_set_entry *bse;
2610 + unsigned entries_needed;
2611 +
2612 + assert("jmacd-5101", a != NULL);
2613 +
2614 + entries_needed = (b == NULL) ? 1 : 2;
2615 + if (list_empty(&bset->entries) ||
2616 + bse_avail(list_entry(bset->entries.next, blocknr_set_entry, link)) < entries_needed) {
2617 + /* See if a bse was previously allocated. */
2618 + if (*new_bsep == NULL) {
2619 + spin_unlock_atom(atom);
2620 + *new_bsep = bse_alloc();
2621 + return (*new_bsep != NULL) ? -E_REPEAT :
2622 + RETERR(-ENOMEM);
2623 + }
2624 +
2625 + /* Put it on the head of the list. */
2626 + list_add(&((*new_bsep)->link), &bset->entries);
2627 +
2628 + *new_bsep = NULL;
2629 + }
2630 +
2631 + /* Add the single or pair. */
2632 + bse = list_entry(bset->entries.next, blocknr_set_entry, link);
2633 + if (b == NULL) {
2634 + bse_put_single(bse, a);
2635 + } else {
2636 + bse_put_pair(bse, a, b);
2637 + }
2638 +
2639 + /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2640 + if (*new_bsep != NULL) {
2641 + bse_free(*new_bsep);
2642 + *new_bsep = NULL;
2643 + }
2644 +
2645 + return 0;
2646 +}
2647 +
2648 +/* Add an extent to the block set. If the length is 1, it is treated as a
2649 + single block (e.g., reiser4_set_add_block). */
2650 +/* Audited by: green(2002.06.11) */
2651 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2652 + kmalloc might schedule. The only exception is atom spinlock, which is
2653 + properly freed. */
2654 +int
2655 +blocknr_set_add_extent(txn_atom * atom,
2656 + blocknr_set * bset,
2657 + blocknr_set_entry ** new_bsep,
2658 + const reiser4_block_nr * start,
2659 + const reiser4_block_nr * len)
2660 +{
2661 + assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2662 + return blocknr_set_add(atom, bset, new_bsep, start,
2663 + *len == 1 ? NULL : len);
2664 +}
2665 +
2666 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
2667 + * by an assertion that both arguments are not null.*/
2668 +/* Audited by: green(2002.06.11) */
2669 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2670 + kmalloc might schedule. The only exception is atom spinlock, which is
2671 + properly freed. */
2672 +int
2673 +blocknr_set_add_pair(txn_atom * atom,
2674 + blocknr_set * bset,
2675 + blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2676 + const reiser4_block_nr * b)
2677 +{
2678 + assert("jmacd-5103", a != NULL && b != NULL);
2679 + return blocknr_set_add(atom, bset, new_bsep, a, b);
2680 +}
2681 +
2682 +/* Initialize a blocknr_set. */
2683 +void blocknr_set_init(blocknr_set *bset)
2684 +{
2685 + INIT_LIST_HEAD(&bset->entries);
2686 +}
2687 +
2688 +/* Release the entries of a blocknr_set. */
2689 +void blocknr_set_destroy(blocknr_set *bset)
2690 +{
2691 + blocknr_set_entry *bse;
2692 +
2693 + while (!list_empty_careful(&bset->entries)) {
2694 + bse = list_entry(bset->entries.next, blocknr_set_entry, link);
2695 + list_del_init(&bse->link);
2696 + bse_free(bse);
2697 + }
2698 +}
2699 +
2700 +/* Merge blocknr_set entries out of @from into @into. */
2701 +/* Audited by: green(2002.06.11) */
2702 +/* Auditor comments: This merge does not know if merged sets contain
2703 + block pairs (as for wandered sets) or extents, so it cannot really merge
2704 + overlapping ranges if there are any. So I believe it may lead to
2705 + some blocks being present several times in one blocknr_set. To help
2706 + debugging such problems it might help to check for duplicate entries on
2707 + actual processing of this set. Testing this kind of stuff right here is
2708 + also complicated by the fact that these sets are not sorted and going
2709 + through the whole set on each element addition is going to be a CPU-heavy task */
2710 +void blocknr_set_merge(blocknr_set * from, blocknr_set * into)
2711 +{
2712 + blocknr_set_entry *bse_into = NULL;
2713 +
2714 + /* If @from is empty, no work to perform. */
2715 + if (list_empty_careful(&from->entries)) {
2716 + return;
2717 + }
2718 +
2719 + /* If @into is not empty, try merging partial-entries. */
2720 + if (!list_empty_careful(&into->entries)) {
2721 +
2722 + /* Neither set is empty, pop the front member of each and try to combine them. */
2723 + blocknr_set_entry *bse_from;
2724 + unsigned into_avail;
2725 +
2726 + bse_into = list_entry(into->entries.next, blocknr_set_entry, link);
2727 + list_del_init(&bse_into->link);
2728 + bse_from = list_entry(from->entries.next, blocknr_set_entry, link);
2729 + list_del_init(&bse_from->link);
2730 +
2731 + /* Combine singles. */
2732 + for (into_avail = bse_avail(bse_into);
2733 + into_avail != 0 && bse_from->nr_singles != 0;
2734 + into_avail -= 1) {
2735 + bse_put_single(bse_into,
2736 + &bse_from->entries[--bse_from->
2737 + nr_singles]);
2738 + }
2739 +
2740 + /* Combine pairs. */
2741 + for (; into_avail > 1 && bse_from->nr_pairs != 0;
2742 + into_avail -= 2) {
2743 + blocknr_pair *pair =
2744 + bse_get_pair(bse_from, --bse_from->nr_pairs);
2745 + bse_put_pair(bse_into, &pair->a, &pair->b);
2746 + }
2747 +
2748 + /* If bse_from is empty, delete it now. */
2749 + if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2750 + bse_free(bse_from);
2751 + } else {
2752 + /* Otherwise, bse_into is full or nearly full (e.g.,
2753 + it could have one slot avail and bse_from has one
2754 + pair left). Push it back onto the list. bse_from
2755 + becomes bse_into, which will be the new partial. */
2756 + list_add(&bse_into->link, &into->entries);
2757 + bse_into = bse_from;
2758 + }
2759 + }
2760 +
2761 + /* Splice lists together. */
2762 + list_splice_init(&from->entries, into->entries.prev);
2763 +
2764 + /* Add the partial entry back to the head of the list. */
2765 + if (bse_into != NULL) {
2766 + list_add(&bse_into->link, &into->entries);
2767 + }
2768 +}
2769 +
2770 +/* Iterate over all blocknr set elements. */
2771 +int blocknr_set_iterator(txn_atom *atom, blocknr_set *bset,
2772 + blocknr_set_actor_f actor, void *data, int delete)
2773 +{
2774 +
2775 + blocknr_set_entry *entry;
2776 +
2777 + assert("zam-429", atom != NULL);
2778 + assert("zam-430", atom_is_protected(atom));
2779 + assert("zam-431", bset != 0);
2780 + assert("zam-432", actor != NULL);
2781 +
2782 + entry = list_entry(bset->entries.next, blocknr_set_entry, link);
2783 + while (&bset->entries != &entry->link) {
2784 + blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2785 + unsigned int i;
2786 + int ret;
2787 +
2788 + for (i = 0; i < entry->nr_singles; i++) {
2789 + ret = actor(atom, &entry->entries[i], NULL, data);
2790 +
2791 + /* We can't break a loop if delete flag is set. */
2792 + if (ret != 0 && !delete)
2793 + return ret;
2794 + }
2795 +
2796 + for (i = 0; i < entry->nr_pairs; i++) {
2797 + struct blocknr_pair *ab;
2798 +
2799 + ab = bse_get_pair(entry, i);
2800 +
2801 + ret = actor(atom, &ab->a, &ab->b, data);
2802 +
2803 + if (ret != 0 && !delete)
2804 + return ret;
2805 + }
2806 +
2807 + if (delete) {
2808 + list_del(&entry->link);
2809 + bse_free(entry);
2810 + }
2811 +
2812 + entry = tmp;
2813 + }
2814 +
2815 + return 0;
2816 +}
2817 +
2818 +/*
2819 + * Local variables:
2820 + * c-indentation-style: "K&R"
2821 + * mode-name: "LC"
2822 + * c-basic-offset: 8
2823 + * tab-width: 8
2824 + * fill-column: 79
2825 + * scroll-step: 1
2826 + * End:
2827 + */
2828 diff -puN /dev/null fs/reiser4/carry.c
2829 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
2830 +++ linux-2.6.16-3-vs/fs/reiser4/carry.c 2006-05-30 18:51:50.088561250 +0400
2831 @@ -0,0 +1,1381 @@
2832 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2833 +/* Functions to "carry" tree modification(s) upward. */
2834 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2835 + set of changes that need to be propagated to the next level. We manage
2836 + node locking such that any searches that collide with carrying are
2837 + restarted, from the root if necessary.
2838 +
2839 + Insertion of a new item may result in items being moved among nodes and
2840 + this requires the delimiting key to be updated at the least common parent
2841 + of the nodes modified to preserve search tree invariants. Also, insertion
2842 + may require allocation of a new node. A pointer to the new node has to be
2843 + inserted into some node on the parent level, etc.
2844 +
2845 + Tree carrying is meant to be analogous to arithmetic carrying.
2846 +
2847 + A carry operation is always associated with some node (&carry_node).
2848 +
2849 + Carry process starts with some initial set of operations to be performed
2850 + and an initial set of already locked nodes. Operations are performed one
2851 + by one. Performing each single operation has following possible effects:
2852 +
2853 + - content of carry node associated with operation is modified
2854 + - new carry nodes are locked and involved into carry process on this level
2855 + - new carry operations are posted to the next level
2856 +
2857 + After all carry operations on this level are done, process is repeated for
2858 + the accumulated sequence on carry operations for the next level. This
2859 + starts by trying to lock (in left to right order) all carry nodes
2860 + associated with carry operations on the parent level. After this, we decide
2861 + whether more nodes are required on the left of already locked set. If so,
2862 + all locks taken on the parent level are released, new carry nodes are
2863 + added, and locking process repeats.
2864 +
2865 + It may happen that balancing process fails owing to unrecoverable error on
2866 + some of upper levels of a tree (possible causes are io error, failure to
2867 + allocate new node, etc.). In this case we should unmount the filesystem,
2868 + rebooting if it is the root, and possibly advise the use of fsck.
2869 +
2870 + USAGE:
2871 +
2872 + int some_tree_operation( znode *node, ... )
2873 + {
2874 + // Allocate on a stack pool of carry objects: operations and nodes.
2875 + // Most carry processes will only take objects from here, without
2876 + // dynamic allocation.
2877 +
2878 +I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2879 +
2880 + carry_pool pool;
2881 + carry_level lowest_level;
2882 + carry_op *op;
2883 +
2884 + init_carry_pool( &pool );
2885 + init_carry_level( &lowest_level, &pool );
2886 +
2887 + // operation may be one of:
2888 + // COP_INSERT --- insert new item into node
2889 + // COP_CUT --- remove part of or whole node
2890 + // COP_PASTE --- increase size of item
2891 + // COP_DELETE --- delete pointer from parent node
2892 + // COP_UPDATE --- update delimiting key in least
2893 + // common ancestor of two
2894 +
2895 + op = post_carry( &lowest_level, operation, node, 0 );
2896 + if( IS_ERR( op ) || ( op == NULL ) ) {
2897 + handle error
2898 + } else {
2899 + // fill in remaining fields in @op, according to carry.h:carry_op
2900 + result = carry( &lowest_level, NULL );
2901 + }
2902 + done_carry_pool( &pool );
2903 + }
2904 +
2905 + When you are implementing node plugin method that participates in carry
2906 + (shifting, insertion, deletion, etc.), do the following:
2907 +
2908 + int foo_node_method( znode *node, ..., carry_level *todo )
2909 + {
2910 + carry_op *op;
2911 +
2912 + ....
2913 +
2914 + // note that the last argument to node_post_carry() is non-zero
2915 + // here, because @op is to be applied to the parent of @node, rather
2916 + // than to the @node itself as in the previous case.
2917 +
2918 + op = node_post_carry( todo, operation, node, 1 );
2919 + // fill in remaining fields in @op, according to carry.h:carry_op
2920 +
2921 + ....
2922 +
2923 + }
2924 +
2925 + BATCHING:
2926 +
2927 + One of the main advantages of level-by-level balancing implemented here is
2928 + ability to batch updates on a parent level and to perform them more
2929 + efficiently as a result.
2930 +
2931 + Description To Be Done (TBD).
2932 +
2933 + DIFFICULTIES AND SUBTLE POINTS:
2934 +
2935 + 1. complex plumbing is required, because:
2936 +
2937 + a. effective allocation through pools is needed
2938 +
2939 + b. target of operation is not exactly known when operation is
2940 + posted. This is worked around through bitfields in &carry_node and
2941 + logic in lock_carry_node()
2942 +
2943 + c. of interaction with locking code: node should be added into sibling
2944 + list when pointer to it is inserted into its parent, which is some time
2945 + after node was created. Between these moments, node is somewhat in
2946 + suspended state and is only registered in the carry lists
2947 +
2948 + 2. whole balancing logic is implemented here, in particular, insertion
2949 + logic is coded in make_space().
2950 +
2951 + 3. special cases like insertion (add_tree_root()) or deletion
2952 + (kill_tree_root()) of tree root and morphing of paste into insert
2953 + (insert_paste()) have to be handled.
2954 +
2955 + 4. there is non-trivial interdependency between allocation of new nodes
2956 + and almost everything else. This is mainly due to the (1.c) above. I shall
2957 + write about this later.
2958 +
2959 +*/
2960 +
2961 +#include "forward.h"
2962 +#include "debug.h"
2963 +#include "key.h"
2964 +#include "coord.h"
2965 +#include "plugin/item/item.h"
2966 +#include "plugin/item/extent.h"
2967 +#include "plugin/node/node.h"
2968 +#include "jnode.h"
2969 +#include "znode.h"
2970 +#include "tree_mod.h"
2971 +#include "tree_walk.h"
2972 +#include "block_alloc.h"
2973 +#include "pool.h"
2974 +#include "tree.h"
2975 +#include "carry.h"
2976 +#include "carry_ops.h"
2977 +#include "super.h"
2978 +#include "reiser4.h"
2979 +
2980 +#include <linux/types.h>
2981 +
2982 +/* level locking/unlocking */
2983 +static int lock_carry_level(carry_level * level);
2984 +static void unlock_carry_level(carry_level * level, int failure);
2985 +static void done_carry_level(carry_level * level);
2986 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2987 +
2988 +int lock_carry_node(carry_level * level, carry_node * node);
2989 +int lock_carry_node_tail(carry_node * node);
2990 +
2991 +/* carry processing proper */
2992 +static int carry_on_level(carry_level * doing, carry_level * todo);
2993 +
2994 +static carry_op *add_op(carry_level * level, pool_ordering order,
2995 + carry_op * reference);
2996 +
2997 +/* handlers for carry operations. */
2998 +
2999 +static void fatal_carry_error(carry_level * doing, int ecode);
3000 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
3001 +
3002 +
3003 +static void print_level(const char *prefix, carry_level * level);
3004 +
3005 +#if REISER4_DEBUG
3006 +typedef enum {
3007 + CARRY_TODO,
3008 + CARRY_DOING
3009 +} carry_queue_state;
3010 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
3011 +#endif
3012 +
3013 +/* main entry point for tree balancing.
3014 +
3015 + Tree carry performs operations from @doing and while doing so accumulates
3016 + information about operations to be performed on the next level ("carried"
3017 + to the parent level). Carried operations are performed, causing possibly
3018 + more operations to be carried upward etc. carry() takes care of
3019 + locking and pinning znodes while operating on them.
3020 +
3021 + For usage, see comment at the top of fs/reiser4/carry.c
3022 +
3023 +*/
3024 +int carry(carry_level * doing /* set of carry operations to be performed */ ,
3025 + carry_level * done /* set of nodes, already performed at the
3026 + * previous level. NULL in most cases */ )
3027 +{
3028 + int result = 0;
3029 + /* queue of new requests */
3030 + carry_level *todo;
3031 + ON_DEBUG(STORE_COUNTERS);
3032 +
3033 + assert("nikita-888", doing != NULL);
3034 + BUG_ON(done != NULL);
3035 +
3036 + todo = doing + 1;
3037 + init_carry_level(todo, doing->pool);
3038 +
3039 + /* queue of requests performed on the previous level */
3040 + done = todo + 1;
3041 + init_carry_level(done, doing->pool);
3042 +
3043 + /* iterate until there is nothing more to do */
3044 + while (result == 0 && doing->ops_num > 0) {
3045 + carry_level *tmp;
3046 +
3047 + /* at this point @done is locked. */
3048 + /* repeat lock/do/unlock while
3049 +
3050 + (1) lock_carry_level() fails due to deadlock avoidance, or
3051 +
3052 + (2) carry_on_level() decides that more nodes have to
3053 + be involved.
3054 +
3055 + (3) some unexpected error occurred while balancing on the
3056 + upper levels. In this case all changes are rolled back.
3057 +
3058 + */
3059 + while (1) {
3060 + result = lock_carry_level(doing);
3061 + if (result == 0) {
3062 + /* perform operations from @doing and
3063 + accumulate new requests in @todo */
3064 + result = carry_on_level(doing, todo);
3065 + if (result == 0)
3066 + break;
3067 + else if (result != -E_REPEAT ||
3068 + !doing->restartable) {
3069 + warning("nikita-1043",
3070 + "Fatal error during carry: %i",
3071 + result);
3072 + print_level("done", done);
3073 + print_level("doing", doing);
3074 + print_level("todo", todo);
3075 + /* do some rough stuff like aborting
3076 + all pending transcrashes and thus
3077 + pushing tree back to the consistent
3078 + state. Alternatively, just panic.
3079 + */
3080 + fatal_carry_error(doing, result);
3081 + return result;
3082 + }
3083 + } else if (result != -E_REPEAT) {
3084 + fatal_carry_error(doing, result);
3085 + return result;
3086 + }
3087 + unlock_carry_level(doing, 1);
3088 + }
3089 + /* at this point @done can be safely unlocked */
3090 + done_carry_level(done);
3091 +
3092 + /* cyclically shift queues */
3093 + tmp = done;
3094 + done = doing;
3095 + doing = todo;
3096 + todo = tmp;
3097 + init_carry_level(todo, doing->pool);
3098 +
3099 + /* give other threads chance to run */
3100 + preempt_point();
3101 + }
3102 + done_carry_level(done);
3103 +
3104 + /* all counters, but x_refs should remain the same. x_refs can change
3105 + owing to transaction manager */
3106 + ON_DEBUG(CHECK_COUNTERS);
3107 + return result;
3108 +}
3109 +
3110 +/* perform carry operations on given level.
3111 +
3112 + Optimizations proposed by pooh:
3113 +
3114 + (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
3115 + required;
3116 +
3117 + (2) unlock node if there are no more operations to be performed upon it and
3118 + node didn't add any operation to @todo. This can be implemented by
3119 + attaching to each node two counters: counter of operations working on this
3120 + node and counter of operations carried upward from this node.
3121 +
3122 +*/
3123 +static int carry_on_level(carry_level * doing /* queue of carry operations to
3124 + * do on this level */ ,
3125 + carry_level * todo /* queue where new carry
3126 + * operations to be performed on
3127 + * the parent level are
3128 + * accumulated during @doing
3129 + * processing. */ )
3130 +{
3131 + int result;
3132 + int (*f) (carry_op *, carry_level *, carry_level *);
3133 + carry_op *op;
3134 + carry_op *tmp_op;
3135 +
3136 + assert("nikita-1034", doing != NULL);
3137 + assert("nikita-1035", todo != NULL);
3138 +
3139 + /* @doing->nodes are locked. */
3140 +
3141 + /* This function can be split into two phases: analysis and modification.
3142 +
3143 + Analysis calculates precisely what items should be moved between
3144 + nodes. This information is gathered in some structures attached to
3145 + each carry_node in a @doing queue. Analysis also determines whether
3146 + new nodes are to be allocated etc.
3147 +
3148 + After analysis is completed, actual modification is performed. Here
3149 + we can take advantage of "batch modification": if there are several
3150 + operations acting on the same node, modifications can be performed
3151 + more efficiently when batched together.
3152 +
3153 + Above is an optimization left for the future.
3154 + */
3155 + /* Important, but delayed optimization: it's possible to batch
3156 + operations together and perform them more efficiently as a
3157 + result. For example, deletion of several neighboring items from a
3158 + node can be converted to a single ->cut() operation.
3159 +
3160 + Before processing queue, it should be scanned and "mergeable"
3161 + operations merged.
3162 + */
3163 + result = 0;
3164 + for_all_ops(doing, op, tmp_op) {
3165 + carry_opcode opcode;
3166 +
3167 + assert("nikita-1041", op != NULL);
3168 + opcode = op->op;
3169 + assert("nikita-1042", op->op < COP_LAST_OP);
3170 + f = op_dispatch_table[op->op].handler;
3171 + result = f(op, doing, todo);
3172 + /* locking can fail with -E_REPEAT. Any different error is fatal
3173 + and will be handled by fatal_carry_error() sledgehammer.
3174 + */
3175 + if (result != 0)
3176 + break;
3177 + }
3178 + if (result == 0) {
3179 + carry_plugin_info info;
3180 + carry_node *scan;
3181 + carry_node *tmp_scan;
3182 +
3183 + info.doing = doing;
3184 + info.todo = todo;
3185 +
3186 + assert("nikita-3002",
3187 + carry_level_invariant(doing, CARRY_DOING));
3188 + for_all_nodes(doing, scan, tmp_scan) {
3189 + znode *node;
3190 +
3191 + node = carry_real(scan);
3192 + assert("nikita-2547", node != NULL);
3193 + if (node_is_empty(node)) {
3194 + result =
3195 + node_plugin_by_node(node)->
3196 + prepare_removal(node, &info);
3197 + if (result != 0)
3198 + break;
3199 + }
3200 + }
3201 + }
3202 + return result;
3203 +}
3204 +
3205 +/* post carry operation
3206 +
3207 + This is main function used by external carry clients: node layout plugins
3208 + and tree operations to create new carry operation to be performed on some
3209 + level.
3210 +
3211 + New operation will be included in the @level queue. To actually perform it,
3212 + call carry( level, ... ). This function takes write lock on @node. Carry
3213 + manages all its locks by itself, don't worry about this.
3214 +
3215 + This function adds operation and node at the end of the queue. It is up to
3216 + caller to guarantee proper ordering of node queue.
3217 +
3218 +*/
3219 +carry_op *post_carry(carry_level * level /* queue where new operation is to
3220 + * be posted at */ ,
3221 + carry_opcode op /* opcode of operation */ ,
3222 + znode * node /* node on which this operation
3223 + * will operate */ ,
3224 + int apply_to_parent_p /* whether operation will operate
3225 + * directly on @node or on its
3226 + * parent. */ )
3227 +{
3228 + carry_op *result;
3229 + carry_node *child;
3230 +
3231 + assert("nikita-1046", level != NULL);
3232 + assert("nikita-1788", znode_is_write_locked(node));
3233 +
3234 + result = add_op(level, POOLO_LAST, NULL);
3235 + if (IS_ERR(result))
3236 + return result;
3237 + child = add_carry(level, POOLO_LAST, NULL);
3238 + if (IS_ERR(child)) {
3239 + reiser4_pool_free(&level->pool->op_pool, &result->header);
3240 + return (carry_op *) child;
3241 + }
3242 + result->node = child;
3243 + result->op = op;
3244 + child->parent = apply_to_parent_p;
3245 + if (ZF_ISSET(node, JNODE_ORPHAN))
3246 + child->left_before = 1;
3247 + child->node = node;
3248 + return result;
3249 +}
3250 +
3251 +/* initialize carry queue */
3252 +void init_carry_level(carry_level * level /* level to initialize */ ,
3253 + carry_pool * pool /* pool @level will allocate objects
3254 + * from */ )
3255 +{
3256 + assert("nikita-1045", level != NULL);
3257 + assert("nikita-967", pool != NULL);
3258 +
3259 + memset(level, 0, sizeof *level);
3260 + level->pool = pool;
3261 +
3262 + INIT_LIST_HEAD(&level->nodes);
3263 + INIT_LIST_HEAD(&level->ops);
3264 +}
3265 +
3266 +/* allocate carry pool and initialize pools within queue */
3267 +carry_pool *init_carry_pool(int size)
3268 +{
3269 + carry_pool *pool;
3270 +
3271 + assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
3272 + pool = kmalloc(size, get_gfp_mask());
3273 + if (pool == NULL)
3274 + return ERR_PTR(RETERR(-ENOMEM));
3275 +
3276 + reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
3277 + (char *)pool->op);
3278 + reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
3279 + NODES_LOCKED_POOL_SIZE, (char *)pool->node);
3280 + return pool;
3281 +}
3282 +
3283 +/* finish with queue pools */
3284 +void done_carry_pool(carry_pool * pool /* pool to destroy */ )
3285 +{
3286 + reiser4_done_pool(&pool->op_pool);
3287 + reiser4_done_pool(&pool->node_pool);
3288 + kfree(pool);
3289 +}
3290 +
3291 +/* add new carry node to the @level.
3292 +
3293 + Returns pointer to the new carry node allocated from pool. It's up to
3294 + callers to maintain proper order in the @level. Assumption is that if carry
3295 + nodes on one level are already sorted and modifications are performed from
3296 + left to right, carry nodes added on the parent level will be ordered
3297 + automatically. To control ordering use @order and @reference parameters.
3298 +
3299 +*/
3300 +carry_node *add_carry_skip(carry_level * level /* &carry_level to add node
3301 + * to */ ,
3302 + pool_ordering order /* where to insert: at the
3303 + * beginning of @level,
3304 + * before @reference, after
3305 + * @reference, at the end
3306 + * of @level */ ,
3307 + carry_node * reference /* reference node for
3308 + * insertion */ )
3309 +{
3310 + ON_DEBUG(carry_node * orig_ref = reference);
3311 +
3312 + if (order == POOLO_BEFORE) {
3313 + reference = find_left_carry(reference, level);
3314 + if (reference == NULL)
3315 + reference = list_entry(level->nodes.next, carry_node,
3316 + header.level_linkage);
3317 + else
3318 + reference = list_entry(reference->header.level_linkage.next,
3319 + carry_node, header.level_linkage);
3320 + } else if (order == POOLO_AFTER) {
3321 + reference = find_right_carry(reference, level);
3322 + if (reference == NULL)
3323 + reference = list_entry(level->nodes.prev, carry_node,
3324 + header.level_linkage);
3325 + else
3326 + reference = list_entry(reference->header.level_linkage.prev,
3327 + carry_node, header.level_linkage);
3328 + }
3329 + assert("nikita-2209",
3330 + ergo(orig_ref != NULL,
3331 + carry_real(reference) == carry_real(orig_ref)));
3332 + return add_carry(level, order, reference);
3333 +}
3334 +
3335 +carry_node *add_carry(carry_level * level /* &carry_level to add node
3336 + * to */ ,
3337 + pool_ordering order /* where to insert: at the
3338 + * beginning of @level, before
3339 + * @reference, after @reference,
3340 + * at the end of @level */ ,
3341 + carry_node * reference /* reference node for
3342 + * insertion */ )
3343 +{
3344 + carry_node *result;
3345 +
3346 + result =
3347 + (carry_node *) add_obj(&level->pool->node_pool, &level->nodes,
3348 + order, &reference->header);
3349 + if (!IS_ERR(result) && (result != NULL))
3350 + ++level->nodes_num;
3351 + return result;
3352 +}
3353 +
3354 +/* add new carry operation to the @level.
3355 +
3356 + Returns pointer to the new carry operations allocated from pool. It's up to
3357 + callers to maintain proper order in the @level. To control ordering use
3358 + @order and @reference parameters.
3359 +
3360 +*/
3361 +static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
3362 + pool_ordering order /* where to insert: at the beginning of
3363 + * @level, before @reference, after
3364 + * @reference, at the end of @level */ ,
3365 + carry_op *
3366 + reference /* reference node for insertion */ )
3367 +{
3368 + carry_op *result;
3369 +
3370 + result =
3371 + (carry_op *) add_obj(&level->pool->op_pool, &level->ops, order,
3372 + &reference->header);
3373 + if (!IS_ERR(result) && (result != NULL))
3374 + ++level->ops_num;
3375 + return result;
3376 +}
3377 +
3378 +/* Return node on the right of which @node was created.
3379 +
3380 + Each node is created on the right of some existing node (or it is new root,
3381 + which is special case not handled here).
3382 +
3383 + @node is new node created on some level, but not yet inserted into its
3384 + parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
3385 +
3386 +*/
3387 +static carry_node *find_begetting_brother(carry_node * node /* node to start search
3388 + * from */ ,
3389 + carry_level * kin UNUSED_ARG /* level to
3390 + * scan */ )
3391 +{
3392 + carry_node *scan;
3393 +
3394 + assert("nikita-1614", node != NULL);
3395 + assert("nikita-1615", kin != NULL);
3396 + assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
3397 + assert("nikita-1619", ergo(carry_real(node) != NULL,
3398 + ZF_ISSET(carry_real(node), JNODE_ORPHAN)));
3399 +
3400 + for (scan = node;;
3401 + scan = list_entry(scan->header.level_linkage.prev, carry_node,
3402 + header.level_linkage)) {
3403 + assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
3404 + if ((scan->node != node->node) &&
3405 + !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
3406 + assert("nikita-1618", carry_real(scan) != NULL);
3407 + break;
3408 + }
3409 + }
3410 + return scan;
3411 +}
3412 +
3413 +static cmp_t
3414 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
3415 +{
3416 + assert("nikita-2199", n1 != NULL);
3417 + assert("nikita-2200", n2 != NULL);
3418 +
3419 + if (n1 == n2)
3420 + return EQUAL_TO;
3421 + while (1) {
3422 + n1 = carry_node_next(n1);
3423 + if (carry_node_end(level, n1))
3424 + return GREATER_THAN;
3425 + if (n1 == n2)
3426 + return LESS_THAN;
3427 + }
3428 + impossible("nikita-2201", "End of level reached");
3429 +}
3430 +
3431 +carry_node *find_carry_node(carry_level * level, const znode * node)
3432 +{
3433 + carry_node *scan;
3434 + carry_node *tmp_scan;
3435 +
3436 + assert("nikita-2202", level != NULL);
3437 + assert("nikita-2203", node != NULL);
3438 +
3439 + for_all_nodes(level, scan, tmp_scan) {
3440 + if (carry_real(scan) == node)
3441 + return scan;
3442 + }
3443 + return NULL;
3444 +}
3445 +
3446 +znode *carry_real(const carry_node * node)
3447 +{
3448 + assert("nikita-3061", node != NULL);
3449 +
3450 + return node->lock_handle.node;
3451 +}
3452 +
3453 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
3454 + const znode * node)
3455 +{
3456 + carry_node *base;
3457 + carry_node *scan;
3458 + carry_node *tmp_scan;
3459 + carry_node *proj;
3460 +
3461 + base = find_carry_node(doing, node);
3462 + assert("nikita-2204", base != NULL);
3463 +
3464 + for_all_nodes(todo, scan, tmp_scan) {
3465 + proj = find_carry_node(doing, scan->node);
3466 + assert("nikita-2205", proj != NULL);
3467 + if (carry_node_cmp(doing, proj, base) != LESS_THAN)
3468 + break;
3469 + }
3470 + return scan;
3471 +}
3472 +
3473 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
3474 + znode * node)
3475 +{
3476 + carry_node *reference;
3477 +
3478 + assert("nikita-2994", doing != NULL);
3479 + assert("nikita-2995", todo != NULL);
3480 + assert("nikita-2996", node != NULL);
3481 +
3482 + reference = insert_carry_node(doing, todo, node);
3483 + assert("nikita-2997", reference != NULL);
3484 +
3485 + return add_carry(todo, POOLO_BEFORE, reference);
3486 +}
3487 +
3488 +/* like post_carry(), but designed to be called from node plugin methods.
3489 + This function is different from post_carry() in that it finds proper place
3490 + to insert node in the queue. */
3491 +carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
3492 + * passed down to node
3493 + * plugin */ ,
3494 + carry_opcode op /* opcode of operation */ ,
3495 + znode * node /* node on which this
3496 + * operation will operate */ ,
3497 + int apply_to_parent_p /* whether operation will
3498 + * operate directly on @node
3499 + * or on its parent. */ )
3500 +{
3501 + carry_op *result;
3502 + carry_node *child;
3503 +
3504 + assert("nikita-2207", info != NULL);
3505 + assert("nikita-2208", info->todo != NULL);
3506 +
3507 + if (info->doing == NULL)
3508 + return post_carry(info->todo, op, node, apply_to_parent_p);
3509 +
3510 + result = add_op(info->todo, POOLO_LAST, NULL);
3511 + if (IS_ERR(result))
3512 + return result;
3513 + child = add_carry_atplace(info->doing, info->todo, node);
3514 + if (IS_ERR(child)) {
3515 + reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
3516 + return (carry_op *) child;
3517 + }
3518 + result->node = child;
3519 + result->op = op;
3520 + child->parent = apply_to_parent_p;
3521 + if (ZF_ISSET(node, JNODE_ORPHAN))
3522 + child->left_before = 1;
3523 + child->node = node;
3524 + return result;
3525 +}
3526 +
3527 +/* lock all carry nodes in @level */
3528 +static int lock_carry_level(carry_level * level /* level to lock */ )
3529 +{
3530 + int result;
3531 + carry_node *node;
3532 + carry_node *tmp_node;
3533 +
3534 + assert("nikita-881", level != NULL);
3535 + assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3536 +
3537 + /* lock nodes from left to right */
3538 + result = 0;
3539 + for_all_nodes(level, node, tmp_node) {
3540 + result = lock_carry_node(level, node);
3541 + if (result != 0)
3542 + break;
3543 + }
3544 + return result;
3545 +}
3546 +
3547 +/* Synchronize delimiting keys between @node and its left neighbor.
3548 +
3549 + To reduce contention on dk key and simplify carry code, we synchronize
3550 + delimiting keys only when carry ultimately leaves tree level (carrying
3551 + changes upward) and unlocks nodes at this level.
3552 +
3553 + This function first finds left neighbor of @node and then updates left
3554 + neighbor's right delimiting key to coincide with least key in @node.
3555 +
3556 +*/
3557 +
3558 +ON_DEBUG(extern atomic_t delim_key_version;
3559 + )
3560 +
3561 +static void sync_dkeys(znode * spot /* node to update */ )
3562 +{
3563 + reiser4_key pivot;
3564 + reiser4_tree *tree;
3565 +
3566 + assert("nikita-1610", spot != NULL);
3567 + assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3568 +
3569 + tree = znode_get_tree(spot);
3570 + read_lock_tree(tree);
3571 + write_lock_dk(tree);
3572 +
3573 + assert("nikita-2192", znode_is_loaded(spot));
3574 +
3575 + /* sync left delimiting key of @spot with key in its leftmost item */
3576 + if (node_is_empty(spot))
3577 + pivot = *znode_get_rd_key(spot);
3578 + else
3579 + leftmost_key_in_node(spot, &pivot);
3580 +
3581 + znode_set_ld_key(spot, &pivot);
3582 +
3583 + /* there can be sequence of empty nodes pending removal on the left of
3584 + @spot. Scan them and update their left and right delimiting keys to
3585 + match left delimiting key of @spot. Also, update right delimiting
3586 + key of first non-empty left neighbor.
3587 + */
3588 + while (1) {
3589 + if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3590 + break;
3591 +
3592 + spot = spot->left;
3593 + if (spot == NULL)
3594 + break;
3595 +
3596 + znode_set_rd_key(spot, &pivot);
3597 + /* don't sink into the domain of another balancing */
3598 + if (!znode_is_write_locked(spot))
3599 + break;
3600 + if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3601 + znode_set_ld_key(spot, &pivot);
3602 + else
3603 + break;
3604 + }
3605 +
3606 + write_unlock_dk(tree);
3607 + read_unlock_tree(tree);
3608 +}
3609 +
3610 +/* unlock all carry nodes in @level; unless @failure, delimiting keys are synced first */
3611 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3612 + int failure /* true if unlocking owing to
3613 + * failure */ )
3614 +{
3615 + carry_node *node;
3616 + carry_node *tmp_node;
3617 +
3618 + assert("nikita-889", level != NULL);
3619 +
3620 + if (!failure) {
3621 + znode *spot;
3622 +
3623 + spot = NULL;
3624 + /* update delimiting keys, once per run of carry nodes sharing a real node */
3625 + for_all_nodes(level, node, tmp_node) {
3626 + if (carry_real(node) != spot) {
3627 + spot = carry_real(node);
3628 + sync_dkeys(spot);
3629 + }
3630 + }
3631 + }
3632 +
3633 + /* nodes can be unlocked in arbitrary order. In preemptible
3634 + environment it's better to unlock in reverse order of locking,
3635 + though.
3636 + */
3637 + for_all_nodes_back(level, node, tmp_node) {
3638 + /* all allocated nodes should be already linked to their
3639 + parents at this moment. */
3640 + assert("nikita-1631", ergo(!failure, !ZF_ISSET(carry_real(node),
3641 + JNODE_ORPHAN)));
3642 + ON_DEBUG(check_dkeys(carry_real(node)));
3643 + unlock_carry_node(level, node, failure);
3644 + }
3645 + level->new_root = NULL; /* forget root created on this level, if any */
3646 +}
3647 +
3648 +/* finish with @level
3649 +
3650 + Unlock nodes (with failure == 0) and return all carry nodes and ops to their pools */
3651 +static void done_carry_level(carry_level * level /* level to finish */ )
3652 +{
3653 + carry_node *node;
3654 + carry_node *tmp_node;
3655 + carry_op *op;
3656 + carry_op *tmp_op;
3657 +
3658 + assert("nikita-1076", level != NULL);
3659 +
3660 + unlock_carry_level(level, 0);
3661 + for_all_nodes(level, node, tmp_node) {
3662 + assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3663 + assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3664 + reiser4_pool_free(&level->pool->node_pool, &node->header);
3665 + }
3666 + for_all_ops(level, op, tmp_op)
3667 + reiser4_pool_free(&level->pool->op_pool, &op->header);
3668 +}
3669 +
3670 +/* helper function to complete locking of carry node
3671 +
3672 + Finish locking of carry node. There are several ways in which new carry
3673 + node can be added into carry level and locked. Normal is through
3674 + lock_carry_node(), but also from find_{left|right}_neighbor(). This
3675 + function factors out common final part of all locking scenarios. It
3676 + supposes that @node -> lock_handle is lock handle for lock just taken,
3677 + marks @node as needing unlock and loads it. Returns result of zload().
3678 +
3679 +*/
3680 +int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3681 +{
3682 + assert("nikita-1052", node != NULL);
3683 + assert("nikita-1187", carry_real(node) != NULL);
3684 + assert("nikita-1188", !node->unlock);
3685 +
3686 + node->unlock = 1;
3687 + /* Load node content into memory and install node plugin by
3688 + looking at the node header.
3689 +
3690 + Most of the time this call is cheap because the node is
3691 + already in memory.
3692 +
3693 + Corresponding zrelse() is in unlock_carry_node()
3694 + */
3695 + return zload(carry_real(node));
3696 +}
3697 +
3698 +/* lock carry node
3699 +
3700 + "Resolve" node to real znode, lock it and mark as locked.
3701 + This requires recursive locking of znodes.
3702 +
3703 + When operation is posted to the parent level, node it will be applied to is
3704 + not yet known. For example, when shifting data between two nodes,
3705 + delimiting key has to be updated in parent or parents of nodes involved. But
3706 + their parents are not yet locked and, moreover, said nodes can be reparented
3707 + by concurrent balancing.
3708 +
3709 + To work around this, carry operation is applied to special "carry node"
3710 + rather than to the znode itself. Carry node consists of some "base" or
3711 + "reference" znode and flags indicating how to get to the target of carry
3712 + operation (->real_node field of carry_node) from base.
3713 +
3714 +*/
3715 +int lock_carry_node(carry_level * level /* level @node is in */ ,
3716 + carry_node * node /* node to lock */ )
3717 +{
3718 + int result;
3719 + znode *reference_point;
3720 + lock_handle lh;
3721 + lock_handle tmp_lh;
3722 + reiser4_tree *tree;
3723 +
3724 + assert("nikita-887", level != NULL);
3725 + assert("nikita-882", node != NULL);
3726 +
3727 + result = 0;
3728 + reference_point = node->node;
3729 + init_lh(&lh);
3730 + init_lh(&tmp_lh);
3731 + if (node->left_before) {
3732 + /* handling of new nodes, allocated on the previous level:
3733 +
3734 + some carry ops were probably posted from the new node, but
3735 + this node neither has parent pointer set, nor is
3736 + connected. This will be done in ->create_hook() for
3737 + internal item.
3738 +
3739 + Nonetheless, parent of new node has to be locked. To do
3740 + this, first go to the "left" in the carry order. This
3741 + depends on the decision to always allocate new node on the
3742 + right of existing one.
3743 +
3744 + Loop handles case when multiple nodes, all orphans, were
3745 + inserted.
3746 +
3747 + Strictly speaking, taking tree lock is not necessary here,
3748 + because all nodes scanned by loop in
3749 + find_begetting_brother() are write-locked by this thread,
3750 + and thus, their sibling linkage cannot change.
3751 +
3752 + */
3753 + tree = znode_get_tree(reference_point);
3754 + read_lock_tree(tree);
3755 + reference_point = find_begetting_brother(node, level)->node;
3756 + read_unlock_tree(tree);
3757 + assert("nikita-1186", reference_point != NULL);
3758 + }
3759 + if (node->parent && (result == 0)) {
3760 + result =
3761 + reiser4_get_parent(&tmp_lh, reference_point,
3762 + ZNODE_WRITE_LOCK);
3763 + if (result != 0) {
3764 + ; /* nothing */
3765 + } else if (znode_get_level(tmp_lh.node) == 0) {
3766 + assert("nikita-1347", znode_above_root(tmp_lh.node));
3767 + result = add_new_root(level, node, tmp_lh.node);
3768 + if (result == 0) {
3769 + reference_point = level->new_root;
3770 + move_lh(&lh, &node->lock_handle);
3771 + }
3772 + } else if ((level->new_root != NULL)
3773 + && (level->new_root !=
3774 + znode_parent_nolock(reference_point))) {
3775 + /* parent of node exists, but this level already
3776 + created different new root, so */
3777 + warning("nikita-1109",
3778 + /* it should be "radicis", but tradition is
3779 + tradition. do banshees read latin? */
3780 + "hodie natus est radici frater");
3781 + result = -EIO;
3782 + } else {
3783 + move_lh(&lh, &tmp_lh);
3784 + reference_point = lh.node;
3785 + }
3786 + }
3787 + if (node->left && (result == 0)) {
3788 + assert("nikita-1183", node->parent);
3789 + assert("nikita-883", reference_point != NULL);
3790 + result =
3791 + reiser4_get_left_neighbor(&tmp_lh, reference_point,
3792 + ZNODE_WRITE_LOCK,
3793 + GN_CAN_USE_UPPER_LEVELS);
3794 + if (result == 0) {
3795 + done_lh(&lh);
3796 + move_lh(&lh, &tmp_lh);
3797 + reference_point = lh.node;
3798 + }
3799 + }
3800 + if (!node->parent && !node->left && !node->left_before) { /* plain case: lock base node itself */
3801 + result =
3802 + longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3803 + ZNODE_LOCK_HIPRI);
3804 + }
3805 + if (result == 0) {
3806 + move_lh(&node->lock_handle, &lh);
3807 + result = lock_carry_node_tail(node);
3808 + }
3809 + done_lh(&tmp_lh);
3810 + done_lh(&lh);
3811 + return result;
3812 +}
3813 +
3814 +/* release a lock on &carry_node.
3815 +
3816 + Release if necessary lock on @node. This operation is pair of
3817 + lock_carry_node() and is idempotent: you can call it more than once on the
3818 + same node.
3819 +
3820 +*/
3821 +static void
3822 +unlock_carry_node(carry_level * level,
3823 + carry_node * node /* node to be released */ ,
3824 + int failure /* non-zero if node is unlocked due
3825 + * to some error */ )
3826 +{
3827 + znode *real_node;
3828 +
3829 + assert("nikita-884", node != NULL);
3830 +
3831 + real_node = carry_real(node);
3832 + /* pair to zload() in lock_carry_node_tail() */
3833 + zrelse(real_node); /* NOTE(review): runs on every call, before the NULL check below - confirm idempotence */
3834 + if (node->unlock && (real_node != NULL)) {
3835 + assert("nikita-899", real_node == node->lock_handle.node);
3836 + longterm_unlock_znode(&node->lock_handle);
3837 + }
3838 + if (failure) {
3839 + if (node->deallocate && (real_node != NULL)) {
3840 + /* free node in bitmap
3841 +
3842 + Prepare node for removal. Last zput() will finish
3843 + with it.
3844 + */
3845 + ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3846 + }
3847 + if (node->free) {
3848 + assert("nikita-2177",
3849 + list_empty_careful(&node->lock_handle.locks_link));
3850 + assert("nikita-2112",
3851 + list_empty_careful(&node->lock_handle.owners_link));
3852 + reiser4_pool_free(&level->pool->node_pool,
3853 + &node->header);
3854 + }
3855 + }
3856 +}
3857 +
3858 +/* fatal_carry_error() - all-catching error handling function
3859 +
3860 + It is possible that carry faces unrecoverable error, like inability to
3861 + insert pointer at the internal level. Our simple solution is just panic in
3862 + this situation. More sophisticated things like attempt to remount
3863 + file-system as read-only can be implemented without much difficulty.
3864 +
3865 + It is believed, that:
3866 +
3867 + 1. instead of panicking, all current transactions can be aborted rolling
3868 + system back to the consistent state.
3869 +
3870 +Umm, if you simply panic without doing anything more at all, then all current
3871 +transactions are aborted and the system is rolled back to a consistent state,
3872 +by virtue of the design of the transactional mechanism. Well, wait, let's be
3873 +precise. If an internal node is corrupted on disk due to hardware failure,
3874 +then there may be no consistent state that can be rolled back to, so instead
3875 +we should say that it will rollback the transactions, which barring other
3876 +factors means rolling back to a consistent state.
3877 +
3878 +# Nikita: there is a subtle difference between panic and aborting
3879 +# transactions: machine doesn't reboot. Processes aren't killed. Processes
3880 +# not using reiser4 (not that we care about such processes), or using other
3881 +# reiser4 mounts (about them we do care) will simply continue to run. With
3882 +# some luck, even application using aborted file system can survive: it will
3883 +# get some error, like EBADF, from each file descriptor on failed file system,
3884 +# but applications that do care about tolerance will cope with this (squid
3885 +# will).
3886 +
3887 +It would be a nice feature though to support rollback without rebooting
3888 +followed by remount, but this can wait for later versions.
3889 +
3890 + 2. once isolated transactions will be implemented it will be possible to
3891 + roll back offending transaction.
3892 +
3893 +2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about
3894 +it more before deciding if it should be done. -Hans
3895 +
3896 +*/
3897 +static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3898 + * where
3899 + * unrecoverable
3900 + * error
3901 + * occurred */ ,
3902 + int ecode /* error code */ )
3903 +{
3904 + assert("nikita-1230", doing != NULL);
3905 + assert("nikita-1231", ecode < 0);
3906 +
3907 + reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3908 +}
3909 +
3910 +/* add new root to the tree
3911 +
3912 + This function itself only manages changes in carry structures and delegates
3913 + all hard work (allocation of znode for new root, changes of parent and
3914 + sibling pointers) to the add_tree_root(). Returns 0 or an error code.
3915 +
3916 + Locking: old tree root is locked by carry at this point. Fake znode is also
3917 + locked.
3918 +
3919 +*/
3920 +static int add_new_root(carry_level * level /* carry level in context of which
3921 + * operation is performed */ ,
3922 + carry_node * node /* carry node for existing root */ ,
3923 + znode * fake /* "fake" znode already locked by
3924 + * us */ )
3925 +{
3926 + int result;
3927 +
3928 + assert("nikita-1104", level != NULL);
3929 + assert("nikita-1105", node != NULL);
3930 +
3931 + assert("nikita-1403", znode_is_write_locked(node->node));
3932 + assert("nikita-1404", znode_is_write_locked(fake));
3933 +
3934 + /* trying to create new root. */
3935 + /* @node is root and it's already locked by us. This
3936 + means that nobody else can be trying to add/remove
3937 + tree root right now.
3938 + */
3939 + if (level->new_root == NULL)
3940 + level->new_root = add_tree_root(node->node, fake);
3941 + if (!IS_ERR(level->new_root)) {
3942 + assert("nikita-1210", znode_is_root(level->new_root));
3943 + node->deallocate = 1;
3944 + result =
3945 + longterm_lock_znode(&node->lock_handle, level->new_root,
3946 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3947 + if (result == 0)
3948 + zput(level->new_root);
3949 + } else {
3950 + result = PTR_ERR(level->new_root);
3951 + level->new_root = NULL; /* do not leave an ERR_PTR value in @level */
3952 + }
3953 + return result;
3954 +}
3955 +
3956 +/* allocate new znode and add the operation that inserts the
3957 + pointer to it into the parent node into the todo level
3958 +
3959 + Allocate new znode, add it into carry queue and post into @todo queue
3960 + request to add pointer to new node into its parent.
3961 +
3962 + This is carry related routine that calls new_node() to allocate new
3963 + node. Returns new carry node or an ERR_PTR value.
3964 +*/
3965 +carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3966 + * node */ ,
3967 + carry_node * ref /* carry node after which new
3968 + * carry node is to be inserted
3969 + * into queue. This affects
3970 + * locking. */ ,
3971 + carry_level * doing /* carry queue where new node is
3972 + * to be added */ ,
3973 + carry_level * todo /* carry queue where COP_INSERT
3974 + * operation to add pointer to
3975 + * new node will be added */ )
3976 +{
3977 + carry_node *fresh;
3978 + znode *new_znode;
3979 + carry_op *add_pointer;
3980 + carry_plugin_info info;
3981 +
3982 + assert("nikita-1048", brother != NULL);
3983 + assert("nikita-1049", todo != NULL);
3984 +
3985 + /* There is a lot of possible variations here: to what parent
3986 + new node will be attached and where. For simplicity, always
3987 + do the following:
3988 +
3989 + (1) new node and @brother will have the same parent.
3990 +
3991 + (2) new node is added on the right of @brother
3992 +
3993 + */
3994 +
3995 + fresh = add_carry_skip(doing, ref ? POOLO_AFTER : POOLO_LAST, ref);
3996 + if (IS_ERR(fresh))
3997 + return fresh;
3998 +
3999 + fresh->deallocate = 1;
4000 + fresh->free = 1;
4001 +
4002 + new_znode = new_node(brother, znode_get_level(brother));
4003 + if (IS_ERR(new_znode))
4004 + /* @fresh will be deallocated automatically by error
4005 + handling code in the caller. */
4006 + return (carry_node *) new_znode;
4007 +
4008 + /* new_node() returned znode with x_count 1. Caller has to decrease
4009 + it. make_space() does. */
4010 +
4011 + ZF_SET(new_znode, JNODE_ORPHAN);
4012 + fresh->node = new_znode;
4013 +
4014 + while (ZF_ISSET(carry_real(ref), JNODE_ORPHAN)) { /* NOTE(review): @ref may be NULL per the check above - confirm */
4015 + ref = carry_node_prev(ref);
4016 + assert("nikita-1606", !carry_node_end(doing, ref));
4017 + }
4018 +
4019 + info.todo = todo;
4020 + info.doing = doing;
4021 + add_pointer = node_post_carry(&info, COP_INSERT, carry_real(ref), 1);
4022 + if (IS_ERR(add_pointer)) {
4023 + /* no need to deallocate @new_znode here: it will be
4024 + deallocated during carry error handling. */
4025 + return (carry_node *) add_pointer;
4026 + }
4027 +
4028 + add_pointer->u.insert.type = COPT_CHILD;
4029 + add_pointer->u.insert.child = fresh;
4030 + add_pointer->u.insert.brother = brother;
4031 + /* initially new node spans empty key range */
4032 + write_lock_dk(znode_get_tree(brother));
4033 + znode_set_ld_key(new_znode,
4034 + znode_set_rd_key(new_znode,
4035 + znode_get_rd_key(brother)));
4036 + write_unlock_dk(znode_get_tree(brother));
4037 + return fresh;
4038 +}
4039 +
4040 +/* DEBUGGING FUNCTIONS.
4041 +
4042 + Probably we also should leave them on even when
4043 + debugging is turned off to print dumps at errors.
4044 +*/
4045 +#if REISER4_DEBUG
4046 +static int carry_level_invariant(carry_level * level, carry_queue_state state) /* 1 if @level looks sane, 0 otherwise */
4047 +{
4048 + carry_node *node;
4049 + carry_node *tmp_node;
4050 +
4051 + if (level == NULL)
4052 + return 0;
4053 +
4054 + if (level->track_type != 0 &&
4055 + level->track_type != CARRY_TRACK_NODE &&
4056 + level->track_type != CARRY_TRACK_CHANGE)
4057 + return 0;
4058 +
4059 + /* check that nodes are in ascending order of leftmost keys; NULL or empty nodes are skipped */
4060 + for_all_nodes(level, node, tmp_node) {
4061 + znode *left;
4062 + znode *right;
4063 +
4064 + reiser4_key lkey;
4065 + reiser4_key rkey;
4066 +
4067 + if (node != carry_node_front(level)) {
4068 + if (state == CARRY_TODO) { /* compare base ->node; otherwise compare carry_real() nodes */
4069 + right = node->node;
4070 + left = carry_node_prev(node)->node;
4071 + } else {
4072 + right = carry_real(node);
4073 + left = carry_real(carry_node_prev(node));
4074 + }
4075 + if (right == NULL || left == NULL)
4076 + continue;
4077 + if (node_is_empty(right) || node_is_empty(left))
4078 + continue;
4079 + if (!keyle(leftmost_key_in_node(left, &lkey),
4080 + leftmost_key_in_node(right, &rkey))) {
4081 + warning("", "wrong key order");
4082 + return 0;
4083 + }
4084 + }
4085 + }
4086 + return 1;
4087 +}
4088 +#endif
4089 +
4090 +/* get symbolic name for boolean: "t" for non-zero, "f" for zero */
4091 +static const char *tf(int boolean /* truth value */ )
4092 +{
4093 + return boolean ? "t" : "f";
4094 +}
4095 +
4096 +/* symbolic name for carry operation; unknown opcodes are rendered in hex from a static buffer */
4097 +static const char *carry_op_name(carry_opcode op /* carry opcode */ )
4098 +{
4099 + switch (op) {
4100 + case COP_INSERT:
4101 + return "COP_INSERT";
4102 + case COP_DELETE:
4103 + return "COP_DELETE";
4104 + case COP_CUT:
4105 + return "COP_CUT";
4106 + case COP_PASTE:
4107 + return "COP_PASTE";
4108 + case COP_UPDATE:
4109 + return "COP_UPDATE";
4110 + case COP_EXTENT:
4111 + return "COP_EXTENT";
4112 + case COP_INSERT_FLOW:
4113 + return "COP_INSERT_FLOW";
4114 + default:{
4115 + /* not mt safe, but who cares? */
4116 + static char buf[32]; /* "unknown op: " + 8 hex digits + NUL needs 21 bytes; 20 overflowed */
4117 +
4118 + snprintf(buf, sizeof(buf), "unknown op: %x", (unsigned)op);
4119 + return buf;
4120 + }
4121 + }
4122 +}
4123 +
4124 +/* dump information about carry node; flags are shown as "t"/"f" (->left_before is not printed) */
4125 +static void print_carry(const char *prefix /* prefix to print */ ,
4126 + carry_node * node /* node to print */ )
4127 +{
4128 + if (node == NULL) {
4129 + printk("%s: null\n", prefix);
4130 + return;
4131 + }
4132 + printk
4133 + ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
4134 + prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
4135 + tf(node->free), tf(node->deallocate));
4136 +}
4137 +
4138 +/* dump information about carry operation: opcode, target carry node and per-opcode details */
4139 +static void print_op(const char *prefix /* prefix to print */ ,
4140 + carry_op * op /* operation to print */ )
4141 +{
4142 + if (op == NULL) {
4143 + printk("%s: null\n", prefix);
4144 + return;
4145 + }
4146 + printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
4147 + print_carry("\tnode", op->node);
4148 + switch (op->op) {
4149 + case COP_INSERT:
4150 + case COP_PASTE:
4151 + print_coord("\tcoord",
4152 + op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
4153 + print_key("\tkey", op->u.insert.d ? op->u.insert.d->key : NULL);
4154 + print_carry("\tchild", op->u.insert.child);
4155 + break;
4156 + case COP_DELETE:
4157 + print_carry("\tchild", op->u.delete.child);
4158 + break;
4159 + case COP_CUT:
4160 + if (op->u.cut_or_kill.is_cut) { /* ->is_cut selects which union member is live */
4161 + print_coord("\tfrom",
4162 + op->u.cut_or_kill.u.cut->params.from, 0);
4163 + print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
4164 + 0);
4165 + } else {
4166 + print_coord("\tfrom",
4167 + op->u.cut_or_kill.u.kill->params.from, 0);
4168 + print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
4169 + 0);
4170 + }
4171 + break;
4172 + case COP_UPDATE:
4173 + print_carry("\tleft", op->u.update.left);
4174 + break;
4175 + default:
4176 + /* do nothing: other opcodes have no extra details printed */
4177 + break;
4178 + }
4179 +}
4180 +
4181 +/* dump @level: its restartable flag, then every carry node and carry op on it */
4182 +static void print_level(const char *prefix /* prefix to print */ ,
4183 + carry_level * level /* level to print */ )
4184 +{
4185 + carry_node *node;
4186 + carry_node *tmp_node;
4187 + carry_op *op;
4188 + carry_op *tmp_op;
4189 +
4190 + if (level == NULL) {
4191 + printk("%s: null\n", prefix);
4192 + return;
4193 + }
4194 + printk("%s: %p, restartable: %s\n",
4195 + prefix, level, tf(level->restartable));
4196 +
4197 + for_all_nodes(level, node, tmp_node)
4198 + print_carry("\tcarry node", node);
4199 + for_all_ops(level, op, tmp_op)
4200 + print_op("\tcarry op", op);
4201 +}
4202 +
4203 +/* Make Linus happy.
4204 + Local variables:
4205 + c-indentation-style: "K&R"
4206 + mode-name: "LC"
4207 + c-basic-offset: 8
4208 + tab-width: 8
4209 + fill-column: 120
4210 + scroll-step: 1
4211 + End:
4212 +*/
4213 diff -puN /dev/null fs/reiser4/carry.h
4214 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
4215 +++ linux-2.6.16-3-vs/fs/reiser4/carry.h 2006-05-30 18:51:50.088561250 +0400
4216 @@ -0,0 +1,442 @@
4217 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4218 +
4219 +/* Functions and data types to "carry" tree modification(s) upward.
4220 + See fs/reiser4/carry.c for details. */
4221 +
4222 +#if !defined( __FS_REISER4_CARRY_H__ )
4223 +#define __FS_REISER4_CARRY_H__
4224 +
4225 +#include "forward.h"
4226 +#include "debug.h"
4227 +#include "pool.h"
4228 +#include "znode.h"
4229 +
4230 +#include <linux/types.h>
4231 +
4232 +/* &carry_node - "location" of carry node.
4233 +
4234 + "location" of node that is involved or going to be involved into
4235 + carry process. Node where operation will be carried to on the
4236 + parent level cannot be recorded explicitly. Operation will be carried
4237 + usually to the parent of some node (where changes are performed at
4238 + the current level) or, to the left neighbor of its parent. But while
4239 + modifications are performed at the current level, parent may
4240 + change. So, we have to allow some indirection (or, positively,
4241 + flexibility) in locating carry nodes.
4242 +
4243 +*/
4244 +typedef struct carry_node {
4245 + /* pool linkage */
4246 + reiser4_pool_header header;
4247 +
4248 + /* base node from which real_node is calculated. See
4249 + fs/reiser4/carry.c:lock_carry_node(). */
4250 + znode *node;
4251 +
4252 + /* how to get ->real_node */
4253 + /* to get ->real_node obtain parent of ->node */
4254 + __u32 parent:1;
4255 + /* to get ->real_node obtain left neighbor of parent of
4256 + ->node */
4257 + __u32 left:1;
4258 + __u32 left_before:1; /* ->node is a new orphan node: lock_carry_node() starts from find_begetting_brother() */
4259 +
4260 + /* locking */
4261 +
4262 + /* this node was locked by carry process and should be
4263 + unlocked when carry leaves a level */
4264 + __u32 unlock:1;
4265 +
4266 + /* disk block for this node was allocated by carry process and
4267 + should be deallocated when carry leaves a level */
4268 + __u32 deallocate:1;
4269 + /* this carry node was allocated by carry process and should be
4270 + freed when carry leaves a level */
4271 + __u32 free:1;
4272 +
4273 + /* handle of the lock carry took on the real node; see lock_carry_node() */
4274 + lock_handle lock_handle;
4275 +} carry_node;
4276 +
4277 +/* &carry_opcode - elementary operations that can be carried upward
4278 +
4279 + Operations that carry() can handle. This list is supposed to be
4280 + expanded.
4281 +
4282 + Each carry operation (cop) is handled by appropriate function defined
4283 + in fs/reiser4/carry.c. For example COP_INSERT is handled by
4284 + fs/reiser4/carry.c:carry_insert() etc. These functions in turn
4285 + call plugins of nodes affected by operation to modify nodes' content
4286 + and to gather operations to be performed on the next level.
4287 +
4288 +*/
4289 +typedef enum {
4290 + /* insert new item into node. */
4291 + COP_INSERT,
4292 + /* delete pointer from parent node */
4293 + COP_DELETE,
4294 + /* remove part of or whole node. */
4295 + COP_CUT,
4296 + /* increase size of item. */
4297 + COP_PASTE,
4298 + /* insert extent (that is sequence of unformatted nodes). */
4299 + COP_EXTENT,
4300 + /* update delimiting key in least common ancestor of two
4301 + nodes. This is performed when items are moved between two
4302 + nodes.
4303 + */
4304 + COP_UPDATE,
4305 + /* insert flow */
4306 + COP_INSERT_FLOW,
4307 + COP_LAST_OP, /* NOTE(review): presumably an end marker / opcode count, not a real operation - confirm */
4308 +} carry_opcode;
4309 +
4310 +#define CARRY_FLOW_NEW_NODES_LIMIT 20 /* limit on new nodes per flow insertion; see ->new_nodes of carry_op's insert_flow */
4311 +
4312 +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
4313 + item is determined. Stored in the __u8 ->type field of carry_op's insert member. */
4314 +typedef enum {
4315 + /* target item is one containing pointer to the ->child node */
4316 + COPT_CHILD,
4317 + /* target item is given explicitly by @coord */
4318 + COPT_ITEM_DATA,
4319 + /* target item is given by key */
4320 + COPT_KEY,
4321 + /* see insert_paste_common() for more comments on this. */
4322 + COPT_PASTE_RESTARTED,
4323 +} cop_insert_pos_type;
4324 +
4325 +/* flags to cut and delete; a bitmask, see ->flags of carry_kill_data and of carry_op's delete member */
4326 +typedef enum {
4327 + /* don't kill node even if it became completely empty as results of
4328 + * cut. This is needed for eottl handling. See carry_extent() for
4329 + * details. */
4330 + DELETE_RETAIN_EMPTY = (1 << 0)
4331 +} cop_delete_flag;
4332 +
4333 +/*
4334 + * carry() implements "lock handle tracking" feature.
4335 + *
4336 + * Callers supply carry with node where to perform initial operation and lock
4337 + * handle on this node. Trying to optimize node utilization carry may actually
4338 + * move insertion point to different node. Callers expect that lock handle
4339 + * will be transferred to the new node also.
4340 + *
4341 + */
4342 +typedef enum {
4343 + /* transfer lock handle along with insertion point */
4344 + CARRY_TRACK_CHANGE = 1,
4345 + /* acquire new lock handle to the node where insertion point is. This
4346 + * is used when carry() client doesn't initially possess lock handle
4347 + * on the insertion point node, for example, by extent insertion
4348 + * code. See carry_extent(). */
4349 + CARRY_TRACK_NODE = 2
4350 +} carry_track_type;
4351 +
4352 +/* data supplied to COP_{INSERT|PASTE} by callers; reached through ->d of carry_op's insert member */
4353 +typedef struct carry_insert_data {
4354 + /* position where new item is to be inserted */
4355 + coord_t *coord;
4356 + /* new item description */
4357 + reiser4_item_data *data;
4358 + /* key of new item */
4359 + const reiser4_key *key;
4360 +} carry_insert_data;
4361 +
4362 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
4363 +struct cut_kill_params {
4364 + /* coord where cut starts (inclusive) */
4365 + coord_t *from;
4366 + /* coord where cut stops (inclusive, this item/unit will also be
4367 + * cut) */
4368 + coord_t *to;
4369 + /* starting key. This is necessary when item and unit pos don't
4370 + * uniquely identify what portion of tree to remove. For example, this
4371 + * indicates what portion of extent unit will be affected. */
4372 + const reiser4_key *from_key;
4373 + /* exclusive stop key */
4374 + const reiser4_key *to_key;
4375 + /* if this is not NULL, smallest actually removed key is stored
4376 + * here. */
4377 + reiser4_key *smallest_removed;
4378 + /* kill_node_content() is called for file truncate */
4379 + int truncate;
4380 +};
4381 +
4382 +struct carry_cut_data { /* parameters of a cut: only the part shared with kill */
4383 + struct cut_kill_params params;
4384 +};
4385 +
4386 +struct carry_kill_data { /* parameters of a kill: shared cut/kill part plus kill-only fields */
4387 + struct cut_kill_params params;
4388 + /* parameter to be passed to the ->kill_hook() method of item
4389 + * plugin */
4390 + /*void *iplug_params; *//* FIXME: unused currently */
4391 + /* if not NULL---inode whose items are being removed. This is needed
4392 + * for ->kill_hook() of extent item to update VM structures when
4393 + * removing pages. */
4394 + struct inode *inode;
4395 + /* sibling list maintenance is complicated by existence of eottl. When
4396 + * eottl whose left and right neighbors are formatted leaves is
4397 + * removed, one has to connect said leaves in the sibling list. This
4398 + * cannot be done when extent removal is just started as locking rules
4399 + * require sibling list update to happen atomically with removal of
4400 + * extent item. Therefore: 1. pointers to left and right neighbors
4401 + * have to be passed down to the ->kill_hook() of extent item, and
4402 + * 2. said neighbors have to be locked. */
4403 + lock_handle *left;
4404 + lock_handle *right;
4405 + /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
4406 + unsigned flags;
4407 + char *buf; /* NOTE(review): purpose is not documented here - confirm with users of carry_kill_data */
4408 +};
4409 +
4410 +/* &carry_op - operation to "carry" upward.
4411 +
4412 + Description of an operation we want to "carry" to the upper level of
4413 + a tree: e.g, when we insert something and there is not enough space
4414 + we allocate a new node and "carry" the operation of inserting a
4415 + pointer to the new node to the upper level, on removal of empty node,
4416 + we carry up operation of removing appropriate entry from parent.
4417 +
4418 + There are two types of carry ops: when adding or deleting node, the
4419 + node at the parent level where appropriate modification has to be
4420 + performed is known in advance. When shifting items between nodes
4421 + (split, merge), delimiting key should be changed in the least common
4422 + parent of the nodes involved that is not known in advance.
4423 +
4424 + For the operations of the first type we store in &carry_op pointer to
4425 + the &carry_node at the parent level. For the operation of the second
4426 + type we store &carry_node of parents of the left and right nodes
4427 + modified and keep track of them upward until they coincide.
4428 +
4429 +*/
4430 +typedef struct carry_op {
4431 + /* pool linkage */
4432 + reiser4_pool_header header;
4433 + carry_opcode op;
4434 + /* node on which operation is to be performed:
4435 +
4436 + for insert, paste: node where new item is to be inserted
4437 +
4438 + for delete: node where pointer is to be deleted
4439 +
4440 + for cut: node to cut from
4441 +
4442 + for update: node where delimiting key is to be modified
4443 +
4444 + for modify: parent of modified node
4445 +
4446 + */
4447 + carry_node *node;
4448 + union {
4449 + struct {
4450 + /* (sub-)type of insertion/paste. Taken from
4451 + cop_insert_pos_type. */
4452 + __u8 type;
4453 + /* various operation flags. Taken from
4454 + cop_insert_flag. */
4455 + __u8 flags;
4456 + carry_insert_data *d;
4457 + carry_node *child;
4458 + znode *brother;
4459 + } insert, paste, extent;
4460 +
4461 + struct {
4462 + int is_cut; /* selects which member of ->u below is in use */
4463 + union {
4464 + carry_kill_data *kill;
4465 + carry_cut_data *cut;
4466 + } u;
4467 + } cut_or_kill;
4468 +
4469 + struct {
4470 + carry_node *left;
4471 + } update;
4472 + struct {
4473 + /* changed child */
4474 + carry_node *child;
4475 + /* bitmask of changes. See &cop_modify_flag */
4476 + __u32 flag;
4477 + } modify;
4478 + struct {
4479 + /* flags to deletion operation. Are taken from
4480 + cop_delete_flag */
4481 + __u32 flags;
4482 + /* child to delete from parent. If this is
4483 + NULL, delete op->node. */
4484 + carry_node *child;
4485 + } delete;
4486 + struct {
4487 + /* various operation flags. Taken from
4488 + cop_insert_flag. */
4489 + __u32 flags;
4490 + flow_t *flow;
4491 + coord_t *insert_point;
4492 + reiser4_item_data *data;
4493 + /* flow insertion is limited by number of new blocks
4494 + added in that operation which do not get any data
4495 + but part of flow. This limit is set by macro
4496 + CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
4497 + of nodes added already during one carry_flow */
4498 + int new_nodes;
4499 + } insert_flow;
4500 + } u;
4501 +} carry_op;
4502 +
4503 +/* &carry_pool - preallocated pool of carry operations, and nodes */
4504 +typedef struct carry_pool {
4505 + carry_op op[CARRIES_POOL_SIZE]; /* storage for carry operations */
4506 + reiser4_pool op_pool; /* NOTE(review): presumably the allocator over ->op[] -- confirm */
4507 + carry_node node[NODES_LOCKED_POOL_SIZE]; /* storage for carry nodes */
4508 + reiser4_pool node_pool; /* unused carry nodes are handed back here with reiser4_pool_free() */
4509 +} carry_pool;
4510 +
4511 +/* &carry_level - carry process on given level
4512 +
4513 + Description of balancing process on the given level.
4514 +
4515 + No need for locking here, as carry_level is essentially per
4516 + thread thing (for now).
4517 +
4518 +*/
4519 +struct carry_level {
4520 + /* this level may be restarted (find_left_neighbor() asks for a restart with -E_REPEAT only when this is set) */
4521 + __u32 restartable:1;
4522 + /* list of carry nodes on this level, ordered by key order */
4523 + struct list_head nodes;
4524 + struct list_head ops; /* list of carry operations on this level; walked by for_all_ops() */
4525 + /* pool where new objects are allocated from */
4526 + carry_pool *pool;
4527 + int ops_num; /* NOTE(review): presumably the length of ->ops -- confirm */
4528 + int nodes_num; /* NOTE(review): presumably the length of ->nodes -- confirm */
4529 + /* new root created on this level, if any */
4530 + znode *new_root;
4531 + /* This is set by caller (insert_by_key(), resize_item(), etc.) when
4532 + they want ->tracked to automagically wander to the node where
4533 + insertion point moved after insert or paste.
4534 + */
4535 + carry_track_type track_type;
4536 + /* lock handle supplied by user that we are tracking. See
4537 + above. */
4538 + lock_handle *tracked;
4539 +};
4540 +
4541 +/* information carry passes to plugin methods that may add new operations to
4542 + the @todo queue */
4543 +struct carry_plugin_info {
4544 + carry_level *doing; /* NOTE(review): presumably the level currently being processed -- confirm */
4545 + carry_level *todo; /* queue that plugin methods may add new operations to */
4546 +};
4547 +
4548 +int carry(carry_level * doing, carry_level * done);
4549 +/* add_carry_skip() may return an ERR_PTR(); its callers in carry_ops.c check the result with IS_ERR() */
4550 +carry_node *add_carry(carry_level * level, pool_ordering order,
4551 + carry_node * reference);
4552 +carry_node *add_carry_skip(carry_level * level, pool_ordering order,
4553 + carry_node * reference);
4554 +
4555 +extern carry_node *insert_carry_node(carry_level * doing,
4556 + carry_level * todo, const znode * node);
4557 +
4558 +extern carry_pool *init_carry_pool(int);
4559 +extern void done_carry_pool(carry_pool * pool);
4560 +
4561 +extern void init_carry_level(carry_level * level, carry_pool * pool);
4562 +
4563 +extern carry_op *post_carry(carry_level * level, carry_opcode op, znode * node,
4564 + int apply_to_parent);
4565 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4566 + znode * node, int apply_to_parent_p);
4567 +/* used by make_space() to allocate a new node; the result is checked with IS_ERR() there */
4568 +carry_node *add_new_znode(znode * brother, carry_node * reference,
4569 + carry_level * doing, carry_level * todo);
4570 +/* callers treat a NULL result as "@node is not queued on @level" */
4571 +carry_node *find_carry_node(carry_level * level, const znode * node);
4572 +/* znode that a carry node stands for */
4573 +extern znode *carry_real(const carry_node * node);
4574 +
4575 +/* helper macros to iterate over carry queues */
4576 +/* carry node linked after @node; use carry_node_end() to detect that the list head was reached */
4577 +#define carry_node_next( node ) \
4578 + list_entry((node)->header.level_linkage.next, carry_node, \
4579 + header.level_linkage)
4580 +/* carry node linked before @node; use carry_node_end() to detect that the list head was reached */
4581 +#define carry_node_prev( node ) \
4582 + list_entry((node)->header.level_linkage.prev, carry_node, \
4583 + header.level_linkage)
4584 +/* first carry node on @level's ->nodes list */
4585 +#define carry_node_front( level ) \
4586 + list_entry((level)->nodes.next, carry_node, header.level_linkage)
4587 +/* last carry node on @level's ->nodes list */
4588 +#define carry_node_back( level ) \
4589 + list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4590 +/* true when @node is really the list head of @level->nodes, i.e. iteration ran off the list */
4591 +#define carry_node_end( level, node ) \
4592 + (&(level)->nodes == &(node)->header.level_linkage)
4593 +
4594 +/* macro to iterate over all operations in a @level; arguments are expanded several times, so pass side-effect-free expressions */
4595 +#define for_all_ops( level /* carry level (of type carry_level *) */, \
4596 + op /* pointer to carry operation, modified by loop (of \
4597 + * type carry_op *) */, \
4598 + tmp /* pointer to carry operation (of type carry_op *), \
4599 + * used to make iterator stable in the face of \
4600 + * deletions from the level */ ) \
4601 +for (op = list_entry((level)->ops.next, carry_op, header.level_linkage), \
4602 + tmp = list_entry((op)->header.level_linkage.next, carry_op, header.level_linkage); \
4603 + &(op)->header.level_linkage != &(level)->ops; \
4604 + op = tmp, \
4605 + tmp = list_entry((op)->header.level_linkage.next, carry_op, header.level_linkage))
4606 +
4607 +#if 0
4608 +for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4609 + tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4610 + ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4611 + op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4612 +#endif
4613 +
4614 +/* macro to iterate over all nodes in a @level; arguments are expanded several times, so pass side-effect-free expressions */
4615 +#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4616 + node /* pointer to carry node, modified by loop (of \
4617 + * type carry_node *) */, \
4618 + tmp /* pointer to carry node (of type carry_node *), \
4619 + * used to make iterator stable in the face of \
4620 + * deletions from the level */ ) \
4621 +for (node = list_entry((level)->nodes.next, carry_node, header.level_linkage), \
4622 + tmp = list_entry((node)->header.level_linkage.next, carry_node, header.level_linkage); \
4623 + &(node)->header.level_linkage != &(level)->nodes; \
4624 + node = tmp, \
4625 + tmp = list_entry((node)->header.level_linkage.next, carry_node, header.level_linkage))
4626 +
4627 +#if 0
4628 +for( node = carry_node_front( level ), \
4629 + tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4630 + node = tmp, tmp = carry_node_next( node ) )
4631 +#endif
4632 +
4633 +/* macro to iterate over all nodes in a @level in reverse order
4634 + (@tmp is loaded with the predecessor of @node before the loop body runs).
4635 + This is used, because nodes are unlocked in reversed order of locking */
4636 +#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4637 + node /* pointer to carry node, modified by loop \
4638 + * (of type carry_node *) */, \
4639 + tmp /* pointer to carry node (of type carry_node \
4640 + * *), used to make iterator stable in the \
4641 + * face of deletions from the level */ ) \
4642 +for( node = carry_node_back( level ), \
4643 + tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4644 + node = tmp, tmp = carry_node_prev( node ) )
4645 +
4646 +/* __FS_REISER4_CARRY_H__ */
4647 +#endif
4648 +
4649 +/* Make Linus happy.
4650 + Local variables:
4651 + c-indentation-style: "K&R"
4652 + mode-name: "LC"
4653 + c-basic-offset: 8
4654 + tab-width: 8
4655 + fill-column: 120
4656 + scroll-step: 1
4657 + End:
4658 +*/
4659 diff -puN /dev/null fs/reiser4/carry_ops.c
4660 --- /dev/null 2003-09-23 21:59:22.000000000 +0400
4661 +++ linux-2.6.16-3-vs/fs/reiser4/carry_ops.c 2006-05-30 18:51:50.100562000 +0400
4662 @@ -0,0 +1,2103 @@
4663 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4664 +
4665 +/* implementation of carry operations */
4666 +
4667 +#include "forward.h"
4668 +#include "debug.h"
4669 +#include "key.h"
4670 +#include "coord.h"
4671 +#include "plugin/item/item.h"
4672 +#include "plugin/node/node.h"
4673 +#include "jnode.h"
4674 +#include "znode.h"
4675 +#include "block_alloc.h"
4676 +#include "tree_walk.h"
4677 +#include "pool.h"
4678 +#include "tree_mod.h"
4679 +#include "carry.h"
4680 +#include "carry_ops.h"
4681 +#include "tree.h"
4682 +#include "super.h"
4683 +#include "reiser4.h"
4684 +
4685 +#include <linux/types.h>
4686 +#include <linux/err.h>
4687 +
4688 +static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4689 + carry_level * doing, carry_level * todo,
4690 + unsigned int including_insert_coord_p);
4691 +/* NOTE(review): the two functions below are not defined in the visible part of this file; presumably they live in carry.c -- confirm */
4692 +extern int lock_carry_node(carry_level * level, carry_node * node);
4693 +extern int lock_carry_node_tail(carry_node * node);
4694 +
4695 +/* find left neighbor of a carry node
4696 +
4697 + Look for left neighbor of op->node and add it to the @doing queue. See
4698 + comments in the body. Returns the neighbor's carry node, NULL when the
4699 + neighbor is not to be used, or ERR_PTR() (-E_REPEAT requests a restart).
4700 +*/
4701 +static carry_node *find_left_neighbor(carry_op * op /* node to find left
4702 + * neighbor of */ ,
4703 + carry_level * doing /* level to scan */ )
4704 +{
4705 + int result;
4706 + carry_node *node;
4707 + carry_node *left;
4708 + int flags;
4709 + reiser4_tree *tree;
4710 +
4711 + node = op->node;
4712 +
4713 + tree = current_tree;
4714 + read_lock_tree(tree);
4715 + /* first, check whether left neighbor is already in a @doing queue */
4716 + if (carry_real(node)->left != NULL) {
4717 + /* NOTE: there is locking subtlety here. Look into
4718 + * find_right_neighbor() for more info */
4719 + if (find_carry_node(doing, carry_real(node)->left) != NULL) {
4720 + read_unlock_tree(tree);
4721 + left = node;
4722 + do {
4723 + left = list_entry(left->header.level_linkage.prev,
4724 + carry_node, header.level_linkage);
4725 + assert("nikita-3408", !carry_node_end(doing,
4726 + left));
4727 + } while (carry_real(left) == carry_real(node));
4728 + return left;
4729 + }
4730 + }
4731 + read_unlock_tree(tree);
4732 +
4733 + left = add_carry_skip(doing, POOLO_BEFORE, node);
4734 + if (IS_ERR(left))
4735 + return left;
4736 +
4737 + left->node = node->node;
4738 + left->free = 1;
4739 + /* '!' binds tighter than '&': the flag bit has to be masked out before it is negated */
4740 + flags = GN_TRY_LOCK;
4741 + if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4742 + flags |= GN_NO_ALLOC;
4743 +
4744 + /* then, feeling lucky, peek left neighbor in the cache. */
4745 + result = reiser4_get_left_neighbor(&left->lock_handle, carry_real(node),
4746 + ZNODE_WRITE_LOCK, flags);
4747 + if (result == 0) {
4748 + /* ok, node found and locked. */
4749 + result = lock_carry_node_tail(left);
4750 + if (result != 0)
4751 + left = ERR_PTR(result);
4752 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4753 + /* node is leftmost node in a tree, or neighbor wasn't in
4754 + cache, or there is an extent on the left. */
4755 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4756 + left = NULL;
4757 + } else if (doing->restartable) {
4758 + /* if left neighbor is locked, and level is restartable, add
4759 + new node to @doing and restart. */
4760 + assert("nikita-913", node->parent != 0);
4761 + assert("nikita-914", node->node != NULL);
4762 + left->left = 1;
4763 + left->free = 0;
4764 + left = ERR_PTR(-E_REPEAT);
4765 + } else {
4766 + /* left neighbor is locked, level cannot be restarted. Just
4767 + ignore left neighbor. */
4768 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4769 + left = NULL;
4770 + }
4771 + return left;
4772 +}
4773 +
4774 +/* find right neighbor of a carry node
4775 +
4776 + Look for right neighbor of op->node and add it to the @doing queue. See
4777 + comments in the body. Returns the neighbor's carry node, NULL when there
4778 + is no usable neighbor, or ERR_PTR() on any other failure.
4779 +*/
4780 +static carry_node *find_right_neighbor(carry_op * op /* node to find right
4781 + * neighbor of */ ,
4782 + carry_level * doing /* level to scan */ )
4783 +{
4784 + int result;
4785 + carry_node *node;
4786 + carry_node *right;
4787 + lock_handle lh;
4788 + int flags;
4789 + reiser4_tree *tree;
4790 +
4791 + init_lh(&lh);
4792 +
4793 + node = op->node;
4794 +
4795 + tree = current_tree;
4796 + read_lock_tree(tree);
4797 + /* first, check whether right neighbor is already in a @doing queue */
4798 + if (carry_real(node)->right != NULL) {
4799 + /*
4800 + * Tree lock is taken here anyway, because, even if _outcome_
4801 + * of (find_carry_node() != NULL) doesn't depends on
4802 + * concurrent updates to ->right, find_carry_node() cannot
4803 + * work with second argument NULL. Hence, following comment is
4804 + * of historic importance only.
4805 + *
4806 + * Subtle:
4807 + *
4808 + * Q: why don't we need tree lock here, looking for the right
4809 + * neighbor?
4810 + *
4811 + * A: even if value of node->real_node->right were changed
4812 + * during find_carry_node() execution, outcome of execution
4813 + * wouldn't change, because (in short) other thread cannot add
4814 + * elements to the @doing, and if node->real_node->right
4815 + * already was in @doing, value of node->real_node->right
4816 + * couldn't change, because node cannot be inserted between
4817 + * locked neighbors.
4818 + */
4819 + if (find_carry_node(doing, carry_real(node)->right) != NULL) {
4820 + read_unlock_tree(tree);
4821 + /*
4822 + * What we are doing here (this is also applicable to
4823 + * the find_left_neighbor()).
4824 + *
4825 + * tree_walk.c code requires that insertion of a
4826 + * pointer to a child, modification of parent pointer
4827 + * in the child, and insertion of the child into
4828 + * sibling list are atomic (see
4829 + * plugin/item/internal.c:create_hook_internal()).
4830 + *
4831 + * carry allocates new node long before pointer to it
4832 + * is inserted into parent and, actually, long before
4833 + * parent is even known. Such allocated-but-orphaned
4834 + * nodes are only trackable through carry level lists.
4835 + *
4836 + * Situation that is handled here is following: @node
4837 + * has valid ->right pointer, but there is
4838 + * allocated-but-orphaned node in the carry queue that
4839 + * is logically between @node and @node->right. Here
4840 + * we are searching for it. Critical point is that
4841 + * this is only possible if @node->right is also in
4842 + * the carry queue (this is checked above), because
4843 + * this is the only way new orphaned node could be
4844 + * inserted between them (before inserting new node,
4845 + * make_space() first tries to shift to the right, so,
4846 + * right neighbor will be locked and queued).
4847 + *
4848 + */
4849 + right = node;
4850 + do {
4851 + right = list_entry(right->header.level_linkage.next,
4852 + carry_node, header.level_linkage);
4853 + assert("nikita-3408", !carry_node_end(doing,
4854 + right));
4855 + } while (carry_real(right) == carry_real(node));
4856 + return right;
4857 + }
4858 + }
4859 + read_unlock_tree(tree);
4860 + /* '!' binds tighter than '&': the flag bit has to be masked out before it is negated */
4861 + flags = GN_CAN_USE_UPPER_LEVELS;
4862 + if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4863 + flags = GN_NO_ALLOC; /* NOTE(review): plain '=' drops GN_CAN_USE_UPPER_LEVELS, unlike '|=' in find_left_neighbor() -- confirm */
4864 +
4865 + /* then, try to lock right neighbor */
4866 + init_lh(&lh);
4867 + result = reiser4_get_right_neighbor(&lh, carry_real(node),
4868 + ZNODE_WRITE_LOCK, flags);
4869 + if (result == 0) {
4870 + /* ok, node found and locked. */
4871 + right = add_carry_skip(doing, POOLO_AFTER, node);
4872 + if (!IS_ERR(right)) {
4873 + right->node = lh.node;
4874 + move_lh(&right->lock_handle, &lh);
4875 + right->free = 1;
4876 + result = lock_carry_node_tail(right);
4877 + if (result != 0)
4878 + right = ERR_PTR(result);
4879 + }
4880 + } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4881 + /* node is rightmost node in a tree, or neighbor wasn't in
4882 + cache, or there is an extent on the right. */
4883 + right = NULL;
4884 + } else
4885 + right = ERR_PTR(result);
4886 + done_lh(&lh);
4887 + return right;
4888 +}
4889 +
4890 +/* how much free space in a @node is needed for @op
4891 +
4892 + How much space in @node is required for completion of @op, where @op is
4893 + insert or paste operation. Thin dispatcher around space_needed().
4894 +*/
4895 +static unsigned int space_needed_for_op(znode * node /* znode data are
4896 + * inserted or
4897 + * pasted in */ ,
4898 + carry_op * op /* carry
4899 + operation */ )
4900 +{
4901 + assert("nikita-919", op != NULL);
4902 +
4903 + switch (op->op) {
4904 + default:
4905 + impossible("nikita-1701", "Wrong opcode"); /* no break: control continues into COP_INSERT if impossible() returns */
4906 + case COP_INSERT:
4907 + return space_needed(node, NULL, op->u.insert.d->data, 1);
4908 + case COP_PASTE:
4909 + return space_needed(node, op->u.insert.d->coord,
4910 + op->u.insert.d->data, 0);
4911 + }
4912 +}
4913 +
4914 +/* how much space in @node is required to insert or paste @data at
4915 + @coord: the item plugin's estimate (or data->length), plus node overhead on insertion. */
4916 +unsigned int space_needed(const znode * node /* node data are inserted or
4917 + * pasted in */ ,
4918 + const coord_t * coord /* coord where data are
4919 + * inserted or pasted
4920 + * at */ ,
4921 + const reiser4_item_data * data /* data to insert or
4922 + * paste */ ,
4923 + int insertion /* non-0 is inserting, 0---paste */ )
4924 +{
4925 + int result;
4926 + item_plugin *iplug;
4927 +
4928 + assert("nikita-917", node != NULL);
4929 + assert("nikita-918", node_plugin_by_node(node) != NULL);
4930 + assert("vs-230", !insertion || (coord == NULL)); /* an insertion must be requested with a NULL @coord */
4931 +
4932 + result = 0;
4933 + iplug = data->iplug;
4934 + if (iplug->b.estimate != NULL) {
4935 + /* ask item plugin how much space is needed to insert this
4936 + item */
4937 + result += iplug->b.estimate(insertion ? NULL : coord, data);
4938 + } else {
4939 + /* reasonable default */
4940 + result += data->length;
4941 + }
4942 + if (insertion) {
4943 + node_plugin *nplug;
4944 +
4945 + nplug = node->nplug;
4946 + /* and add node overhead */
4947 + if (nplug->item_overhead != NULL) {
4948 + result += nplug->item_overhead(node, NULL);
4949 + }
4950 + }
4951 + return result; /* int converted to the unsigned return type */
4952 +}
4953 +
4954 +/* find &coord in parent where pointer to new child is to be stored; returns the result of find_new_child_ptr(). */
4955 +static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4956 + * insert pointer to new
4957 + * child */ )
4958 +{
4959 + int result;
4960 + znode *node;
4961 + znode *child;
4962 +
4963 + assert("nikita-941", op != NULL);
4964 + assert("nikita-942", op->op == COP_INSERT);
4965 +
4966 + node = carry_real(op->node);
4967 + assert("nikita-943", node != NULL);
4968 + assert("nikita-944", node_plugin_by_node(node) != NULL);
4969 +
4970 + child = carry_real(op->u.insert.child);
4971 + result =
4972 + find_new_child_ptr(node, child, op->u.insert.brother,
4973 + op->u.insert.d->coord);
4974 + /* the item data is filled in regardless of @result */
4975 + build_child_ptr_data(child, op->u.insert.d->data);
4976 + return result;
4977 +}
4978 +
4979 +/* additional amount of free space in @node required to complete @op; make_space() treats a result <= 0 as "fits" */
4980 +static int free_space_shortage(znode * node /* node to check */ ,
4981 + carry_op * op /* operation being performed */ )
4982 +{
4983 + assert("nikita-1061", node != NULL);
4984 + assert("nikita-1062", op != NULL);
4985 +
4986 + switch (op->op) {
4987 + default:
4988 + impossible("nikita-1702", "Wrong opcode"); /* no break: control continues into the insert/paste case if impossible() returns */
4989 + case COP_INSERT:
4990 + case COP_PASTE:
4991 + return space_needed_for_op(node, op) - znode_free_space(node);
4992 + case COP_EXTENT:
4993 + /* when inserting extent shift data around until insertion
4994 + point is utmost in the node. Only the sign of the result matters here. */
4995 + if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4996 + return +1;
4997 + else
4998 + return -1;
4999 + }
5000 +}
5001 +
5002 +/* helper function: update node pointer in operation after insertion
5003 + point was probably shifted into @target. Returns the znode of the insertion coord. */
5004 +static znode *sync_op(carry_op * op, carry_node * target)
5005 +{
5006 + znode *insertion_node;
5007 +
5008 + /* reget node from coord: shift might move insertion coord to
5009 + the neighbor */
5010 + insertion_node = op->u.insert.d->coord->node;
5011 + /* if insertion point was actually moved into new node,
5012 + update carry node pointer in operation. */
5013 + if (insertion_node != carry_real(op->node)) {
5014 + op->node = target;
5015 + assert("nikita-2540", carry_real(target) == insertion_node);
5016 + }
5017 + assert("nikita-2541",
5018 + carry_real(op->node) == op->u.insert.d->coord->node);
5019 + return insertion_node;
5020 +}
5021 +
5022 +/*
5023 + * complete make_space() call: update tracked lock handle if necessary. See
5024 + * comments for fs/reiser4/carry.h:carry_track_type. Returns 0 or the longterm_lock_znode() result.
5025 + */
5026 +static int
5027 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
5028 +{
5029 + int result;
5030 + carry_track_type tracking;
5031 + znode *node;
5032 +
5033 + tracking = doing->track_type;
5034 + node = op->u.insert.d->coord->node;
5035 + /* CARRY_TRACK_NODE re-locks unconditionally, CARRY_TRACK_CHANGE only when the insertion node changed */
5036 + if (tracking == CARRY_TRACK_NODE ||
5037 + (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
5038 + /* inserting or pasting into node different from
5039 + original. Update lock handle supplied by caller. */
5040 + assert("nikita-1417", doing->tracked != NULL);
5041 + done_lh(doing->tracked);
5042 + init_lh(doing->tracked);
5043 + result = longterm_lock_znode(doing->tracked, node,
5044 + ZNODE_WRITE_LOCK,
5045 + ZNODE_LOCK_HIPRI);
5046 + } else
5047 + result = 0;
5048 + return result;
5049 +}
5050 +
5051 +/* This is insertion policy function. It shifts data to the left and right
5052 + neighbors of insertion coord and allocates new nodes until there is enough
5053 + free space to complete @op.
5054 +
5055 + See comments in the body. Returns 0, -E_REPEAT, -E_NODE_FULL or another error code.
5056 +
5057 + Assumes that the node format favors insertions at the right end of the node
5058 + as node40 does.
5059 +
5060 + See carry_flow() for details about flow insertion
5061 +*/
5062 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
5063 + carry_level * doing /* current carry queue */ ,
5064 + carry_level * todo /* carry queue on the parent level */ )
5065 +{
5066 + znode *node;
5067 + int result;
5068 + int not_enough_space;
5069 + int blk_alloc;
5070 + znode *orig_node;
5071 + __u32 flags;
5072 +
5073 + coord_t *coord;
5074 +
5075 + assert("nikita-890", op != NULL);
5076 + assert("nikita-891", todo != NULL);
5077 + assert("nikita-892",
5078 + op->op == COP_INSERT ||
5079 + op->op == COP_PASTE || op->op == COP_EXTENT);
5080 + assert("nikita-1607",
5081 + carry_real(op->node) == op->u.insert.d->coord->node);
5082 +
5083 + flags = op->u.insert.flags;
5084 +
5085 + /* NOTE check that new node can only be allocated after checking left
5086 + * and right neighbors. This is necessary for proper work of
5087 + * find_{left,right}_neighbor(). */
5088 + assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
5089 + flags & COPI_DONT_SHIFT_LEFT));
5090 + assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
5091 + flags & COPI_DONT_SHIFT_RIGHT));
5092 +
5093 + coord = op->u.insert.d->coord;
5094 + orig_node = node = coord->node;
5095 +
5096 + assert("nikita-908", node != NULL);
5097 + assert("nikita-909", node_plugin_by_node(node) != NULL);
5098 +
5099 + result = 0;
5100 + /* If there is not enough space in a node, try to shift something to
5101 + the left neighbor. This is a bit tricky, as locking to the left is
5102 + low priority. This is handled by restart logic in carry().
5103 + */
5104 + not_enough_space = free_space_shortage(node, op);
5105 + if (not_enough_space <= 0)
5106 + /* it is possible that carry was called when there actually
5107 + was enough space in the node. For example, when inserting
5108 + leftmost item so that delimiting keys have to be updated.
5109 + */
5110 + return make_space_tail(op, doing, orig_node);
5111 + if (!(flags & COPI_DONT_SHIFT_LEFT)) {
5112 + carry_node *left;
5113 + /* look up (and try to lock) the left neighbor; a restart
5114 + request (-E_REPEAT) is passed on to the caller */
5115 + left = find_left_neighbor(op, doing);
5116 + if (unlikely(IS_ERR(left))) {
5117 + if (PTR_ERR(left) == -E_REPEAT)
5118 + return -E_REPEAT;
5119 + else {
5120 + /* some error other than restart request
5121 + occurred. This shouldn't happen. Issue a
5122 + warning and continue as if left neighbor
5123 + weren't existing.
5124 + */
5125 + warning("nikita-924",
5126 + "Error accessing left neighbor: %li",
5127 + PTR_ERR(left));
5128 + }
5129 + } else if (left != NULL) {
5130 + /* NOTE(review): this @result may be overwritten by the right shift below before it is examined -- confirm intended */
5131 + /* shift everything possible on the left of and
5132 + including insertion coord into the left neighbor */
5133 + result = carry_shift_data(LEFT_SIDE, coord,
5134 + carry_real(left), doing, todo,
5135 + flags & COPI_GO_LEFT);
5136 +
5137 + /* reget node from coord: shift_left() might move
5138 + insertion coord to the left neighbor */
5139 + node = sync_op(op, left);
5140 +
5141 + not_enough_space = free_space_shortage(node, op);
5142 + /* There is not enough free space in @node, but
5143 + may be, there is enough free space in
5144 + @left. Various balancing decisions are valid here.
5145 + The same for the shifting to the right.
5146 + */
5147 + }
5148 + }
5149 + /* If there still is not enough space, shift to the right */
5150 + if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
5151 + carry_node *right;
5152 +
5153 + right = find_right_neighbor(op, doing);
5154 + if (IS_ERR(right)) {
5155 + warning("nikita-1065",
5156 + "Error accessing right neighbor: %li",
5157 + PTR_ERR(right));
5158 + } else if (right != NULL) {
5159 + /* node containing insertion point, and its right
5160 + neighbor node are write locked by now.
5161 +
5162 + shift everything possible on the right of but
5163 + excluding insertion coord into the right neighbor
5164 + */
5165 + result = carry_shift_data(RIGHT_SIDE, coord,
5166 + carry_real(right),
5167 + doing, todo,
5168 + flags & COPI_GO_RIGHT);
5169 + /* reget node from coord: shift_right() might move
5170 + insertion coord to the right neighbor */
5171 + node = sync_op(op, right);
5172 + not_enough_space = free_space_shortage(node, op);
5173 + }
5174 + }
5175 + /* If there is still not enough space, allocate new node(s).
5176 +
5177 + We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
5178 + the carry operation flags (currently this is needed during flush
5179 + only). At most two new nodes are allocated (blk_alloc < 2).
5180 + */
5181 + for (blk_alloc = 0;
5182 + not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
5183 + !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
5184 + carry_node *fresh; /* new node we are allocating */
5185 + coord_t coord_shadow; /* remembered insertion point before
5186 + * shifting data into new node */
5187 + carry_node *node_shadow; /* remembered insertion node before
5188 + * shifting */
5189 + unsigned int gointo; /* whether insertion point should move
5190 + * into newly allocated node */
5191 +
5192 + /* allocate new node on the right of @node. Znode and disk
5193 + fake block number for new node are allocated.
5194 +
5195 + add_new_znode() posts carry operation COP_INSERT with
5196 + COPT_CHILD option to the parent level to add
5197 + pointer to newly created node to its parent.
5198 +
5199 + Subtle point: if several new nodes are required to complete
5200 + insertion operation at this level, they will be inserted
5201 + into their parents in the order of creation, which means
5202 + that @node will be valid "cookie" at the time of insertion.
5203 +
5204 + */
5205 + fresh = add_new_znode(node, op->node, doing, todo);
5206 + if (IS_ERR(fresh))
5207 + return PTR_ERR(fresh);
5208 +
5209 + /* Try to shift into new node. */
5210 + result = lock_carry_node(doing, fresh);
5211 + zput(carry_real(fresh));
5212 + if (result != 0) {
5213 + warning("nikita-947",
5214 + "Cannot lock new node: %i", result);
5215 + return result;
5216 + }
5217 +
5218 + /* both nodes are write locked by now.
5219 +
5220 + shift everything possible on the right of and
5221 + including insertion coord into the right neighbor.
5222 + */
5223 + coord_dup(&coord_shadow, op->u.insert.d->coord);
5224 + node_shadow = op->node;
5225 + /* move insertion point into newly created node if:
5226 +
5227 + . insertion point is rightmost in the source node, or
5228 + . this is not the first node we are allocating in a row.
5229 + */
5230 + gointo =
5231 + (blk_alloc > 0) ||
5232 + coord_is_after_rightmost(op->u.insert.d->coord);
5233 +
5234 + result = carry_shift_data(RIGHT_SIDE, coord, carry_real(fresh),
5235 + doing, todo, gointo);
5236 + /* if insertion point was actually moved into new node,
5237 + update carry node pointer in operation. */
5238 + node = sync_op(op, fresh);
5239 + not_enough_space = free_space_shortage(node, op);
5240 + if ((not_enough_space > 0) && (node != coord_shadow.node)) {
5241 + /* there is not enough free space in new node. Shift
5242 + insertion point back to the @node_shadow so that
5243 + next new node would be inserted between
5244 + @node_shadow and @fresh.
5245 + */
5246 + coord_normalize(&coord_shadow);
5247 + coord_dup(coord, &coord_shadow);
5248 + node = coord->node;
5249 + op->node = node_shadow;
5250 + if (1 || (flags & COPI_STEP_BACK)) { /* NOTE(review): "1 ||" makes this unconditional; COPI_STEP_BACK is not consulted */
5251 + /* still not enough space?! Maybe there is
5252 + enough space in the source node (i.e., node
5253 + data are moved from) now.
5254 + */
5255 + not_enough_space =
5256 + free_space_shortage(node, op);
5257 + }
5258 + }
5259 + }
5260 + if (not_enough_space > 0) {
5261 + if (!(flags & COPI_DONT_ALLOCATE))
5262 + warning("nikita-948", "Cannot insert new item");
5263 + result = -E_NODE_FULL;
5264 + }
5265 + assert("nikita-1622", ergo(result == 0,
5266 + carry_real(op->node) == coord->node));
5267 + assert("nikita-2616", coord == op->u.insert.d->coord);
5268 + if (result == 0)
5269 + result = make_space_tail(op, doing, orig_node);
5270 + return result;
5271 +}
5272 +
5273 +/* insert_paste_common() - common part of insert and paste operations
5274 +
5275 + This function performs common part of COP_INSERT and COP_PASTE.
5276 +
5277 + There are two ways in which insertion/paste can be requested:
5278 +
5279 + . by directly supplying reiser4_item_data. In this case, op ->
5280 + u.insert.type is set to COPT_ITEM_DATA.
5281 +
5282 + . by supplying child pointer to which is to inserted into parent. In this
5283 + case op -> u.insert.type == COPT_CHILD.
5284 +
5285 + . by supplying key of new item/unit. This is currently only used during
5286 + extent insertion
5287 +
5288 + This is required, because when new node is allocated we don't know at what
5289 + position pointer to it is to be stored in the parent. Actually, we don't
5290 + even know what its parent will be, because parent can be re-balanced
5291 + concurrently and new node re-parented, and because parent can be full and
5292 + pointer to the new node will go into some other node.
5293 +
5294 + insert_paste_common() resolves pointer to child node into position in the
5295 + parent by calling find_new_child_coord(), that fills
5296 + reiser4_item_data. After this, insertion/paste proceeds uniformly.
5297 +
5298 + Another complication is with finding free space during pasting. It may
5299 + happen that while shifting items to the neighbors and newly allocated
5300 + nodes, insertion coord can no longer be in the item we wanted to paste
5301 + into. At this point, paste becomes (morphs) into insert. Moreover free
5302 + space analysis has to be repeated, because amount of space required for
5303 + insertion is different from that of paste (item header overhead, etc).