sqlite3 db [lindex $argv 0]
set DB [btree_open [lindex $argv 0] 1000 0]
-# In-memory database for collecting statistics
+# In-memory database for collecting statistics. This script loops through
+# the tables and indices in the database being analyzed, adding a row for each
+# to an in-memory database (for which the schema is shown below). It then
+# queries the in-memory db to produce the space-analysis report.
#
sqlite3 mem :memory:
set tabledef\
);}
mem eval $tabledef
-# This query will be used to find the root page number for every table
-# in the database.
+# Quote a string for use in an SQL query. Examples:
#
-set sql {
- SELECT name, rootpage
- FROM sqlite_master WHERE type='table'
- UNION ALL
- SELECT 'sqlite_master', 1
- ORDER BY 1
-}
-
-# Quote a string for SQL
+# [quote {hello world}] == {'hello world'}
+# [quote {hello world's}] == {'hello world''s'}
#
-proc quote txt {
+proc quote {txt} {
regsub -all ' $txt '' q
return '$q'
}
-# Analyze every table in the database, one at a time.
+# This proc is a wrapper around the btree_cursor_info command. The
+# second argument is an open btree cursor returned by [btree_cursor].
+# The first argument is the name of an array variable that exists in
+# the scope of the caller. If the third argument is non-zero, then
+# info is returned for the page that lies $up entries upwards in the
+# tree-structure. (i.e. $up==1 returns the parent page, $up==2 the
+# grandparent etc.)
+#
+# The following entries in that array are filled in with information retrieved
+# using [btree_cursor_info]:
+#
+# $arrayvar(page_no) = The page number
+# $arrayvar(entry_no) = The entry number
+# $arrayvar(page_entries) = Total number of entries on this page
+# $arrayvar(cell_size) = Cell size (local payload + header)
+# $arrayvar(page_freebytes) = Number of free bytes on this page
+# $arrayvar(page_freeblocks) = Number of free blocks on the page
+# $arrayvar(payload_bytes) = Total payload size (local + overflow)
+# $arrayvar(header_bytes) = Header size in bytes
+# $arrayvar(local_payload_bytes) = Local payload size
+# $arrayvar(parent) = Parent page number
+#
+proc cursor_info {arrayvar csr {up 0}} {
+ # Bind the local name "a" to the caller's array named by $arrayvar, so the
+ # assignments below land in the caller's scope.
+ upvar $arrayvar a
+ # [foreach] with an empty body is used here as a multiple-assignment idiom:
+ # the elements of the list returned by [btree_cursor_info] are stored, in
+ # order, into the array entries named in the variable list.
+ foreach [list a(page_no) \
+ a(entry_no) \
+ a(page_entries) \
+ a(cell_size) \
+ a(page_freebytes) \
+ a(page_freeblocks) \
+ a(payload_bytes) \
+ a(header_bytes) \
+ a(local_payload_bytes) \
+ a(parent) ] [btree_cursor_info $csr $up] {}
+}
+
+# Determine the page-size of the database. This global variable is used
+# throughout the script.
#
set pageSize [db eval {PRAGMA page_size}]
+
+# Analyze every table in the database, one at a time.
+#
+# The following query returns the name and root-page of each table in the
+# database, including the sqlite_master table.
+#
+set sql {
+ SELECT name, rootpage FROM sqlite_master WHERE type='table'
+ UNION ALL
+ SELECT 'sqlite_master', 1
+ ORDER BY 1
+}
foreach {name rootpage} [db eval $sql] {
puts stderr "Analyzing table $name..."
- set cursor [btree_cursor $DB $rootpage 0]
- set go [btree_first $cursor]
- catch {unset seen}
+
+ # Code below traverses the table being analyzed (table name $name), using the
+ # btree cursor $csr. Statistics related to table $name are accumulated in
+ # the following variables:
+ #
set total_payload 0 ;# Payload space used by all entries
set total_ovfl 0 ;# Payload space on overflow pages
set unused_int 0 ;# Unused space on interior nodes
set ovfl_pages 0 ;# Number of overflow pages used
set leaf_pages 0 ;# Number of leaf pages
set int_pages 0 ;# Number of interior pages
- while {$go==0} {
+
+ # As the btree is traversed, the array variable $seen($pgno) is set to 1
+ # the first time page $pgno is encountered.
+ #
+ catch {unset seen}
+
+ # The following loop runs once for each entry in table $name. The table
+ # is traversed using the btree cursor stored in variable $csr
+ #
+ set csr [btree_cursor $DB $rootpage 0]
+ for {btree_first $csr} {![btree_eof $csr]} {btree_next $csr} {
incr cnt_leaf_entry
- set stat [btree_cursor_info $cursor]
- set payload [lindex $stat 6]
- if {$payload>$mx_payload} {set mx_payload $payload}
- incr total_payload $payload
- set local [lindex $stat 8]
- set ovfl [expr {$payload-$local}]
+
+ # Retrieve information about the entry the btree-cursor points to into
+ # the array variable $ci (cursor info).
+ #
+ cursor_info ci $csr
+
+ # Check if the payload of this entry is greater than the current
+ # $mx_payload statistic for the table. Also increase the $total_payload
+ # statistic.
+ #
+ if {$ci(payload_bytes)>$mx_payload} {set mx_payload $ci(payload_bytes)}
+ incr total_payload $ci(payload_bytes)
+
+ # If this entry uses overflow pages, then update the $cnt_ovfl,
+ # $total_ovfl, $ovfl_pages and $unused_ovfl statistics.
+ #
+ set ovfl [expr {$ci(payload_bytes)-$ci(local_payload_bytes)}]
if {$ovfl} {
incr cnt_ovfl
incr total_ovfl $ovfl
incr ovfl_pages $n
incr unused_ovfl [expr {$n*($pageSize-4) - $ovfl}]
}
- set pgno [lindex $stat 0]
- if {![info exists seen($pgno)]} {
- set seen($pgno) 1
+
+ # If this is the first table entry analyzed for the page, then update
+ # the page-related statistics $leaf_pages and $unused_leaf. Also, if
+ # this page has a parent page that has not been analyzed, retrieve
+ # info for the parent and update statistics for it too.
+ #
+ if {![info exists seen($ci(page_no))]} {
+ set seen($ci(page_no)) 1
incr leaf_pages
- incr unused_leaf [lindex $stat 4]
- set parent [lindex $stat 9]
- set up 0
- while {$parent!=0 && ![info exists seen($parent)]} {
- incr up
- set stat [btree_cursor_info $cursor $up]
- set seen($parent) 1
+ incr unused_leaf $ci(page_freebytes)
+
+ # Now check if the page has a parent that has not been analyzed. If
+ # so, update the $int_pages, $cnt_int_entry and $unused_int statistics
+ # accordingly. Then check if the parent page has a parent that has
+ # not yet been analyzed etc.
+ #
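+ # $ci(parent) is the parent of the page currently described by $ci; each
+ # pass of the loop reloads $ci from one level further up the tree.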
+ for {set up 1} \
+ {$ci(parent)!=0 && ![info exists seen($ci(parent))]} {incr up} \
+ {
+ # Mark the parent as seen.
+ #
+ set seen($ci(parent)) 1
+
+ # Retrieve info for the parent and update statistics.
+ cursor_info ci $csr $up
incr int_pages
- incr cnt_int_entry [lindex $stat 2]
- incr unused_int [lindex $stat 4]
- set parent [lindex $stat 9]
+ incr cnt_int_entry $ci(page_entries)
+ incr unused_int $ci(page_freebytes)
}
}
- set go [btree_next $cursor]
}
- btree_close_cursor $cursor
+ btree_close_cursor $csr
+
+ # Handle the special case where a table contains no data. In this case
+ # all statistics are zero, except for the number of leaf pages (1) and
+ # the unused bytes on leaf pages ($pageSize - 8).
+ #
+ # An exception to the above is the sqlite_master table. If it is empty
+ # then all statistics are zero except for the number of leaf pages (1),
+ # and the number of unused bytes on leaf pages ($pageSize - 112).
+ #
if {[llength [array names seen]]==0} {
set leaf_pages 1
- set unused_leaf [expr {$pageSize-8}]
- } elseif {$rootpage==1 && ![info exists seen(1)]} {
- incr int_pages
- incr unused_int [expr {$pageSize-112}]
+ if {$rootpage==1} {
+ set unused_leaf [expr {$pageSize-112}]
+ } else {
+ set unused_leaf [expr {$pageSize-8}]
+ }
}
+
+ # Insert the statistics for the table analyzed into the in-memory database.
+ #
set sql "INSERT INTO space_used VALUES("
append sql [quote $name]
append sql ",[quote $name]"
mem eval $sql
}
-# This query will be used to find the root page number for every index
-# in the database.
+# Analyze every index in the database, one at a time.
+#
+# The query below returns the name, associated table and root-page number
+# for every index in the database.
#
set sql {
- SELECT name, tbl_name, rootpage
- FROM sqlite_master WHERE type='index'
+ SELECT name, tbl_name, rootpage FROM sqlite_master WHERE type='index'
ORDER BY 2, 1
}
-
-# Analyze every index in the database, one at a time.
-#
-set pageSize [db eval {PRAGMA page_size}]
foreach {name tbl_name rootpage} [db eval $sql] {
puts stderr "Analyzing index $name of table $tbl_name..."
- set cursor [btree_cursor $DB $rootpage 0]
- set go [btree_first $cursor]
- catch {unset seen}
+
+ # Code below traverses the index being analyzed (index name $name), using the
+ # btree cursor $csr. Statistics related to index $name are accumulated in
+ # the following variables:
+ #
set total_payload 0 ;# Payload space used by all entries
set total_ovfl 0 ;# Payload space on overflow pages
set unused_leaf 0 ;# Unused space on leaf nodes
set mx_payload 0 ;# Maximum payload size
set ovfl_pages 0 ;# Number of overflow pages used
set leaf_pages 0 ;# Number of leaf pages
- while {$go==0} {
+
+ # As the btree is traversed, the array variable $seen($pgno) is set to 1
+ # the first time page $pgno is encountered.
+ #
+ catch {unset seen}
+
+ # The following loop runs once for each entry in index $name. The index
+ # is traversed using the btree cursor stored in variable $csr
+ #
+ set csr [btree_cursor $DB $rootpage 0]
+ for {btree_first $csr} {![btree_eof $csr]} {btree_next $csr} {
incr cnt_leaf_entry
- set stat [btree_cursor_info $cursor]
- set payload [btree_keysize $cursor]
+
+ # Retrieve information about the entry the btree-cursor points to into
+ # the array variable $ci (cursor info).
+ #
+ cursor_info ci $csr
+
+ # Check if the payload of this entry is greater than the current
+ # $mx_payload statistic for the index. Also increase the $total_payload
+ # statistic.
+ #
+ set payload [btree_keysize $csr]
if {$payload>$mx_payload} {set mx_payload $payload}
incr total_payload $payload
- set local [lindex $stat 8]
- set ovfl [expr {$payload-$local}]
+
+ # If this entry uses overflow pages, then update the $cnt_ovfl,
+ # $total_ovfl, $ovfl_pages and $unused_ovfl statistics.
+ #
+ set ovfl [expr {$payload-$ci(local_payload_bytes)}]
if {$ovfl} {
incr cnt_ovfl
incr total_ovfl $ovfl
incr ovfl_pages $n
incr unused_ovfl [expr {$n*($pageSize-4) - $ovfl}]
}
- set pgno [lindex $stat 0]
- if {![info exists seen($pgno)]} {
- set seen($pgno) 1
+
+ # If this is the first index entry analyzed for the page, then update
+ # the page-related statistics $leaf_pages and $unused_leaf.
+ #
+ if {![info exists seen($ci(page_no))]} {
+ set seen($ci(page_no)) 1
incr leaf_pages
- incr unused_leaf [lindex $stat 4]
+ incr unused_leaf $ci(page_freebytes)
}
- set go [btree_next $cursor]
}
- btree_close_cursor $cursor
+ btree_close_cursor $csr
+
+ # Handle the special case where an index contains no data. In this case
+ # all statistics are zero, except for the number of leaf pages (1) and
+ # the unused bytes on leaf pages ($pageSize - 8).
+ #
if {[llength [array names seen]]==0} {
set leaf_pages 1
set unused_leaf [expr {$pageSize-8}]
}
+
+ # Insert the statistics for the index analyzed into the in-memory database.
+ #
set sql "INSERT INTO space_used VALUES("
append sql [quote $name]
append sql ",[quote $tbl_name]"
# the $where clause determines which subset to analyze.
#
proc subreport {title where} {
- global pageSize
- set hit 0
+ global pageSize file_pgcnt
+
+ # Query the in-memory database for the sum of various statistics
+ # for the subset of tables/indices identified by the WHERE clause in
+ # $where. Note that even if the WHERE clause matches no rows, the
+ # following query returns exactly one row (because it is an aggregate).
+ #
+ # The results of the query are stored directly by SQLite into local
+ # variables (i.e. $nentry, $nleaf etc.).
+ #
mem eval "
SELECT
sum(nentry) AS nentry,
sum(leaf_unused) AS leaf_unused,
sum(int_unused) AS int_unused,
sum(ovfl_unused) AS ovfl_unused
- FROM space_used WHERE $where" {} {set hit 1}
- if {!$hit} {return 0}
+ FROM space_used WHERE $where" {} {}
+
+ # Output the sub-report title, nicely decorated with * characters.
+ #
puts ""
set len [string length $title]
- incr len 5
- set stars "***********************************"
- append stars $stars
- set stars [string range $stars $len end]
+ set stars [string repeat * [expr 65-$len]]
puts "*** $title $stars"
puts ""
+
+ # Calculate statistics and store the results in TCL variables, as follows:
+ #
+ # total_pages: Database pages consumed.
+ # total_pages_percent: Pages consumed as a percentage of the file.
+ # storage: Bytes consumed.
+ # payload_percent: Payload bytes used as a percentage of $storage.
+ # total_unused: Unused bytes on pages.
+ # avg_payload: Average payload per btree entry.
+ # avg_fanout: Average fanout for internal pages.
+ # avg_unused: Average unused bytes per btree entry.
+ # ovfl_cnt_percent: Percentage of btree entries that use overflow pages.
+ #
set total_pages [expr {$leaf_pages+$int_pages+$ovfl_pages}]
- statline "Percentage of total database" [percent $total_pages $::file_pgcnt]
- statline "Number of entries" $nleaf
- set total_unused [expr {$ovfl_unused+$int_unused+$leaf_unused}]
+ set total_pages_percent [percent $total_pages $file_pgcnt]
set storage [expr {$total_pages*$pageSize}]
- statline "Bytes of storage consumed" $storage
- statline "Bytes of payload" $payload \
- [percent $payload $storage {of storage consumed}]
- statline "Average payload per entry" [expr {$nleaf>0?$payload/$nleaf:0}]
- set avgunused [expr {$nleaf>0?$total_unused/$nleaf:0}]
- statline "Average unused bytes per entry" $avgunused
- set nint [expr {$nentry-$nleaf}]
+ set payload_percent [percent $payload $storage {of storage consumed}]
+ set total_unused [expr {$ovfl_unused+$int_unused+$leaf_unused}]
+ set avg_payload [expr {$nleaf>0?$payload/$nleaf:0}]
+ set avg_unused [expr {$nleaf>0?$total_unused/$nleaf:0}]
if {$int_pages>0} {
- statline "Average fanout" [format %.2f [expr {($nint+0.0)/$int_pages}]]
+ # TODO: Is this formula correct?
+ set avg_fanout [format %.2f [expr double($nentry-$nleaf)/$int_pages]]
}
- statline "Maximum payload per entry" $mx_payload
- statline "Entries that use overflow" $ovfl_cnt \
- [percent $ovfl_cnt $nleaf {of all entries}]
+ set ovfl_cnt_percent [percent $ovfl_cnt $nleaf {of all entries}]
+
+ # Print out the sub-report statistics.
+ #
+ statline {Percentage of total database} $total_pages_percent
+ statline {Number of entries} $nleaf
+ statline {Bytes of storage consumed} $storage
+ statline {Bytes of payload} $payload $payload_percent
+ statline {Average payload per entry} $avg_payload
+ statline {Average unused bytes per entry} $avg_unused
+ if {[info exists avg_fanout]} {
+ statline {Average fanout} $avg_fanout
+ }
+ statline {Maximum payload per entry} $mx_payload
+ statline {Entries that use overflow} $ovfl_cnt $ovfl_cnt_percent
if {$int_pages>0} {
- statline "Index pages used" $int_pages
+ statline {Index pages used} $int_pages
}
- statline "Primary pages used" $leaf_pages
- statline "Overflow pages used" $ovfl_pages
- statline "Total pages used" $total_pages
+ statline {Primary pages used} $leaf_pages
+ statline {Overflow pages used} $ovfl_pages
+ statline {Total pages used} $total_pages
if {$int_unused>0} {
- statline "Unused bytes on index pages" $int_unused \
+ set int_unused_percent \
[percent $int_unused [expr {$int_pages*$pageSize}] {of index space}]
+ statline "Unused bytes on index pages" $int_unused $int_unused_percent
}
statline "Unused bytes on primary pages" $leaf_unused \
[percent $leaf_unused [expr {$leaf_pages*$pageSize}] {of primary space}]
# This procedure calculates and returns the number of pages used by the
# auto-vacuum 'pointer-map'. If the database does not support auto-vacuum,
# then 0 is returned. The two arguments are the size of the database file in
-# bytes and the page size used by the database (also in bytes).
+# pages and the page size used by the database (in bytes).
proc autovacuum_overhead {filePages pageSize} {
# Read the value of meta 4. If non-zero, then the database supports
return [expr int(ceil( ($filePages-1.0)/($ptrsPerPage+1.0) ))]
}
-# Output summary statistics:
-#
-puts "/** Disk-Space Utilization Report For $file_to_analyze"
-puts "*** As of [clock format [clock seconds] -format {%Y-%b-%d %H:%M:%S}]"
-puts ""
-# Variables:
+# Calculate the summary statistics for the database and store the results
+# in TCL variables. They are output below. Variables are as follows:
#
# pageSize: Size of each page in bytes.
# file_bytes: File size in bytes.
# nindex: Number of indices in the db.
# nautoindex: Number of indices created automatically.
# nmanindex: Number of indices created manually.
-# user_payload:
+# user_payload: Number of bytes of payload in table btrees
+# (not including sqlite_master)
+# user_percent: $user_payload as a percentage of total file size.
set file_bytes [file size $file_to_analyze]
set file_pgcnt [expr {$file_bytes/$pageSize}]
set av_pgcnt [autovacuum_overhead $file_pgcnt $pageSize]
set av_percent [percent $av_pgcnt $file_pgcnt]
-set q {SELECT sum(leaf_pages+int_pages+ovfl_pages) FROM space_used}
-set inuse_pgcnt [expr [mem eval $q]]
+set sql {SELECT sum(leaf_pages+int_pages+ovfl_pages) FROM space_used}
+set inuse_pgcnt [expr [mem eval $sql]]
set inuse_percent [percent $inuse_pgcnt $file_pgcnt]
set free_pgcnt [expr $file_pgcnt-$inuse_pgcnt-$av_pgcnt]
set ntable [db eval {SELECT count(*)+1 FROM sqlite_master WHERE type='table'}]
set nindex [db eval {SELECT count(*) FROM sqlite_master WHERE type='index'}]
-set q {SELECT count(*) FROM sqlite_master WHERE name LIKE 'sqlite_autoindex%'}
-set nautoindex [db eval $q]
+set sql {SELECT count(*) FROM sqlite_master WHERE name LIKE 'sqlite_autoindex%'}
+set nautoindex [db eval $sql]
set nmanindex [expr {$nindex-$nautoindex}]
# set total_payload [mem eval "SELECT sum(payload) FROM space_used"]
set user_payload [mem one {SELECT sum(payload) FROM space_used
WHERE NOT is_index AND name NOT LIKE 'sqlite_master'}]
-set user_payload_percent [percent $user_payload $file_bytes]
+set user_percent [percent $user_payload $file_bytes]
+# Output the summary statistics calculated above.
+#
+puts "/** Disk-Space Utilization Report For $file_to_analyze"
+puts "*** As of [clock format [clock seconds] -format {%Y-%b-%d %H:%M:%S}]"
+puts ""
statline {Page size in bytes} $pageSize
statline {Pages in the whole file (measured)} $file_pgcnt
statline {Pages in the whole file (calculated)} $file_pgcnt2
statline {Number of named indices} $nmanindex
statline {Automatically generated indices} $nautoindex
statline {Size of the file in bytes} $file_bytes
-statline {Bytes of user payload stored} $user_payload $user_payload_percent
+statline {Bytes of user payload stored} $user_payload $user_percent
# Output table rankings
#
divided by the total number of bytes.
}
-# Output the database
+# Output a dump of the in-memory database. This can be used for more
+# complex offline analysis.
#
puts "**********************************************************************"
puts "The entire text of this report can be sourced into any SQL database"