InnoDB 5.4.1: separate undo purge thread, "ahead" flushing, purge/flush counters.

What this patch does (all statements below describe the hunks that follow):
  * srv0srv.c / srv0srv.h / srv0start.c: add srv_purge_thread() and the
    SRV_PURGE thread type (SRV_MASTER is renumbered from 7 to 8), start the
    thread right after the master thread, and remove the two trx_purge()
    loops from the existing code in srv0srv.c.
  * srv0srv.c: add an "ahead flushing" branch (srv_ahead_flushing) that calls
    buf_flush_batch() when the oldest-modification age or the dirty page
    ratio gets high, and only sets skip_sleep when a batch was really run.
  * buf0flu.c, log0log.c, log0recv.c, row0mysql.c, srv0srv.c: maintain
    srv_buf_flush_*, srv_purge_* and srv_dml_* counters and print them in two
    new sections ("PURGE STATUS", "BUFFER FLUSH") of the monitor output.
  * trx0purge.c: apply the DML delay whenever srv_max_purge_lag > 0 and cap
    the delay at 50000 microseconds.

Review notes:
  * The line structure and indentation of this file were reconstructed from
    a whitespace-collapsed copy using the hunk line counts. Whitespace in
    context lines may differ from the pristine tree; apply with
    "patch -l" (ignore whitespace) and check the result.
  * The counters are of type ulint; they are printed with %lu and an
    (unsigned long) cast (the first version used %d).
  * NOTE(review): srv_purge_thread() leaves its loop only when
    srv_fast_shutdown is non-zero and srv_shutdown_state > 0 - confirm the
    behaviour for a slow shutdown (srv_fast_shutdown == 0).
  * NOTE(review): the ahead-flushing code reads log_sys->lsn and computes
    "age * 100 / log_sys->max_checkpoint_age" in ulint arithmetic - confirm
    locking and the 32-bit overflow margin.

diff -Naru innobase-5.4.1-orig/buf/buf0flu.c innobase-5.4.1-dim/buf/buf0flu.c
--- innobase-5.4.1-orig/buf/buf0flu.c	2009-06-24 01:31:12.000000000 +0200
+++ innobase-5.4.1-dim/buf/buf0flu.c	2009-08-11 01:26:26.000000000 +0200
@@ -851,6 +851,8 @@
 #endif /* UNIV_SYNC_DEBUG */
 	mutex_enter(&(buf_pool->mutex));
 
+	srv_buf_flush_calls++;
+
 	if ((buf_pool->n_flush[flush_type] > 0)
 	    || (buf_pool->init_flush[flush_type] == TRUE)) {
 
@@ -862,6 +864,7 @@
 	}
 
 	(buf_pool->init_flush)[flush_type] = TRUE;
+	srv_buf_flush_exec++;
 
 	for (;;) {
 		/* If we have flushed enough, leave the loop */
@@ -920,6 +923,8 @@
 				flush_type, offset,
 				page_count - old_page_count); */
 
+				srv_buf_flush_pages += page_count - old_page_count;
+
 				mutex_enter(&(buf_pool->mutex));
 
 			} else if (flush_type == BUF_FLUSH_LRU) {
@@ -1053,6 +1058,8 @@
 	n_to_flush = buf_flush_LRU_recommendation();
 
 	if (n_to_flush > 0) {
+		srv_buf_flush_LRU_calls++;
+
 		n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush,
 					    ut_dulint_zero);
 		if (n_flushed == ULINT_UNDEFINED) {
diff -Naru innobase-5.4.1-orig/include/srv0srv.h innobase-5.4.1-dim/include/srv0srv.h
--- innobase-5.4.1-orig/include/srv0srv.h	2009-06-24 01:31:15.000000000 +0200
+++ innobase-5.4.1-dim/include/srv0srv.h	2009-08-11 10:29:00.000000000 +0200
@@ -184,6 +184,23 @@
 extern ulint	srv_activity_count;
 extern ulint	srv_fatal_semaphore_wait_threshold;
 extern ulint	srv_dml_needed_delay;
+extern ulint	srv_dml_needed_delay_max;
+extern ulint	srv_dml_delayed;
+
+extern ulint	srv_buf_flush_calls;
+extern ulint	srv_buf_flush_exec;
+extern ulint	srv_buf_flush_pages;
+extern ulint	srv_buf_flush_recv_calls;
+extern ulint	srv_buf_flush_log_calls;
+extern ulint	srv_buf_flush_LRU_calls;
+extern ulint	srv_buf_flush_master_calls;
+extern ulint	srv_buf_flush_ahead_calls;
+extern ulint	srv_buf_flush_dirtypct_calls;
+
+extern ulint	srv_purge_calls;
+extern ulint	srv_purge_exec;
+extern ulint	srv_purge_pages;
+extern ulint	srv_purge_sleeps;
 
 extern mutex_t*	kernel_mutex_temp;/* mutex protecting the server, trx structs,
 				query threads, and lock table: we allocate
@@ -358,15 +375,26 @@
 			moment */
 	ulint	type,	/* in: thread type */
 	ulint	n);	/* in: number of threads to release */
+
 /*************************************************************************
 The master thread controlling the server. */
-
 os_thread_ret_t
 srv_master_thread(
 /*==============*/
 			/* out: a dummy parameter */
 	void*	arg);	/* in: a dummy parameter required by
 			os_thread_create */
+
+/*************************************************************************
+The undo purge thread. */
+os_thread_ret_t
+srv_purge_thread(
+/*==============*/
+			/* out: a dummy parameter */
+	void*	arg);	/* in: a dummy parameter required by
+			os_thread_create */
+
+
 /***********************************************************************
 Tells the Innobase server that there has been activity in the database
 and wakes up the master thread if it is suspended (not sleeping). Used
@@ -489,7 +517,8 @@
 				not currently in use */
 #define SRV_INSERT	6	/* thread flushing the insert buffer to disk,
 				not currently in use */
-#define SRV_MASTER	7	/* the master thread, (whose type number must
+#define SRV_PURGE	7	/* thread purging undo records */
+#define SRV_MASTER	8	/* the master thread, (whose type number must
 				be biggest) */
 
 /* Thread slot in the thread table */
diff -Naru innobase-5.4.1-orig/log/log0log.c innobase-5.4.1-dim/log/log0log.c
--- innobase-5.4.1-orig/log/log0log.c	2009-06-24 01:31:17.000000000 +0200
+++ innobase-5.4.1-dim/log/log0log.c	2009-08-11 01:21:53.000000000 +0200
@@ -1599,6 +1599,8 @@
 		recv_apply_hashed_log_recs(TRUE);
 	}
 
+	srv_buf_flush_log_calls++;
+
 	n_pages = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, new_oldest);
 
 	if (sync) {
diff -Naru innobase-5.4.1-orig/log/log0recv.c innobase-5.4.1-dim/log/log0recv.c
--- innobase-5.4.1-orig/log/log0recv.c	2009-06-24 01:31:17.000000000 +0200
+++ innobase-5.4.1-dim/log/log0recv.c	2009-08-11 10:24:57.000000000 +0200
@@ -1503,6 +1503,7 @@
 		mutex_exit(&(recv_sys->mutex));
 		mutex_exit(&(log_sys->mutex));
 
+		srv_buf_flush_recv_calls++;
 		n_pages = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX,
 					  ut_dulint_max);
 		ut_a(n_pages != ULINT_UNDEFINED);
diff -Naru innobase-5.4.1-orig/row/row0mysql.c innobase-5.4.1-dim/row/row0mysql.c
--- innobase-5.4.1-orig/row/row0mysql.c	2009-06-24 01:31:17.000000000 +0200
+++ innobase-5.4.1-dim/row/row0mysql.c	2009-08-09 18:29:27.000000000 +0200
@@ -91,6 +91,7 @@
 /*===========================*/
 {
 	if (srv_dml_needed_delay) {
+		srv_dml_delayed++;
 		os_thread_sleep(srv_dml_needed_delay);
 	}
 }
diff -Naru innobase-5.4.1-orig/srv/srv0srv.c innobase-5.4.1-dim/srv/srv0srv.c
--- innobase-5.4.1-orig/srv/srv0srv.c	2009-06-24 01:31:18.000000000 +0200
+++ innobase-5.4.1-dim/srv/srv0srv.c	2009-08-26 19:04:47.000000000 +0200
@@ -62,7 +62,29 @@
 
 /* How much data manipulation language (DML) statements need to be delayed,
 in microseconds, in order to reduce the lagging of the purge thread. */
-ulint	srv_dml_needed_delay = 0;
+ulint srv_dml_needed_delay = 0;
+ulint srv_dml_needed_delay_max = 0;
+ulint srv_dml_delayed = 0;
+
+/* Calls to trx_purge */
+ulint srv_purge_calls = 0;
+ulint srv_purge_exec = 0;
+ulint srv_purge_pages = 0;
+ulint srv_purge_sleeps = 0;
+
+/* Calls of buf_flush_batch and really executed calls */
+ulint srv_buf_flush_calls = 0;
+ulint srv_buf_flush_exec = 0;
+ulint srv_buf_flush_pages = 0;
+ulint srv_buf_flush_recv_calls = 0;
+ulint srv_buf_flush_log_calls = 0;
+ulint srv_buf_flush_LRU_calls = 0;
+ulint srv_buf_flush_master_calls = 0;
+ulint srv_buf_flush_ahead_calls = 0;
+ulint srv_buf_flush_dirtypct_calls = 0;
+
+/* Ahead Flushing */
+ibool srv_ahead_flushing = TRUE;
 
 ibool	srv_lock_timeout_and_monitor_active = FALSE;
 ibool	srv_error_monitor_active = FALSE;
@@ -422,6 +444,10 @@
 ulint	srv_main_thread_process_no	= 0;
 ulint	srv_main_thread_id		= 0;
 
+ulint	srv_purge_thread_process_no	= 0;
+ulint	srv_purge_thread_id		= 0;
+
+
 /* The following count work done by srv_master_thread. */
 
 /* Iterations by the 'once per second' loop */
@@ -439,6 +465,7 @@
 /* Calls to log_buffer_flush_maybe_sync */
 ulint	srv_async_flush = 0;
 
+
 /* Number of microseconds threads wait because of
 innodb_thread_concurrency */
 static ib_longlong	srv_thread_wait_mics = 0;
@@ -1973,6 +2000,30 @@
 	srv_n_rows_deleted_old = srv_n_rows_deleted;
 	srv_n_rows_read_old = srv_n_rows_read;
 
+	fputs("------------\n"
+		"PURGE STATUS\n"
+		"------------\n", file);
+	fprintf( file, "History len: %lu\n"
+		"DML Delay: %lu max: %lu delayed-queries: %lu \n"
+		"Purge calls: %lu exec: %lu sleeps: %lu purged-pages: %lu \n",
+		(unsigned long) trx_sys->rseg_history_len, (unsigned long) srv_dml_needed_delay,
+		(unsigned long) srv_dml_needed_delay_max, (unsigned long) srv_dml_delayed,
+		(unsigned long) srv_purge_calls, (unsigned long) srv_purge_exec,
+		(unsigned long) srv_purge_sleeps, (unsigned long) srv_purge_pages );
+
+	srv_dml_needed_delay_max= 0;
+
+	fputs("------------\n"
+		"BUFFER FLUSH\n"
+		"------------\n", file);
+	fprintf( file, "Flush calls: %lu exec: %lu flushed-pages: %lu \n"
+		"Called-by recovery: %lu redolog: %lu LRU: %lu master: %lu ahead: %lu dirty-pct: %lu\n",
+		(unsigned long) srv_buf_flush_calls, (unsigned long) srv_buf_flush_exec,
+		(unsigned long) srv_buf_flush_pages, (unsigned long) srv_buf_flush_recv_calls,
+		(unsigned long) srv_buf_flush_log_calls, (unsigned long) srv_buf_flush_LRU_calls,
+		(unsigned long) srv_buf_flush_master_calls, (unsigned long) srv_buf_flush_ahead_calls,
+		(unsigned long) srv_buf_flush_dirtypct_calls );
+
 	fputs("----------------------------\n"
 	      "END OF INNODB MONITOR OUTPUT\n"
 	      "============================\n", file);
@@ -2507,6 +2558,7 @@
 			/* Try to keep the number of modified pages in the
 			buffer pool under the limit wished by the user */
 
+			srv_buf_flush_dirtypct_calls++;
 			n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
 							  PCT_IO(100),
 							  ut_dulint_max);
@@ -2516,8 +2568,47 @@
 			to flush. Do not sleep 1 second during the next
 			iteration of this loop. */
 
-			skip_sleep = TRUE;
-		}
+			if( n_pages_flushed != ULINT_UNDEFINED ) skip_sleep = TRUE;
+
+		} else if( srv_ahead_flushing ) {
+
+			/* Flush ahead dirty pages to avoid oldest modification age
+			coming too close to the max checkpoint age OR dirty pct coming
+			too close to the limit set by user..
+			-Dimitri */
+
+			dulint lsn;
+			int pct= 0;
+
+			lsn = buf_pool_get_oldest_modification();
+
+			if( !ut_dulint_is_zero(lsn) ) {
+
+				pct= (int)( ut_dulint_minus( log_sys->lsn, lsn ) * 100 / log_sys->max_checkpoint_age );
+				pct= pct < 25 ? 0 : (pct < 75 ? pct - 20 : 80 );
+			}
+
+			if( UNIV_UNLIKELY(buf_get_modified_ratio_pct()
+				> srv_max_buf_pool_modified_pct / 2)) {
+
+				srv_buf_flush_ahead_calls++;
+				n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+							PCT_IO( pct > 40 ? pct : 40 ),
+							ut_dulint_max);
+
+				if( n_pages_flushed != ULINT_UNDEFINED ) skip_sleep = TRUE;
+
+			} else if( pct ) {
+
+				srv_buf_flush_ahead_calls++;
+				n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+							PCT_IO( pct ),
+							ut_dulint_max);
+
+				if( n_pages_flushed != ULINT_UNDEFINED ) skip_sleep = TRUE;
+
+			}
+		}
 
 		if (srv_activity_count == old_activity_count) {
 
@@ -2552,6 +2643,7 @@
 	    n_pend_ios < 3 && (n_ios - n_ios_very_old < PCT_IO(200))) {
 
 		srv_main_thread_op_info = "flushing buffer pool pages";
+		srv_buf_flush_master_calls++;
 		buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
 
 		srv_main_thread_op_info = "flushing log";
@@ -2571,34 +2663,6 @@
 	log_buffer_flush_maybe_sync();
 	srv_async_flush++;
 
-	/* We run a full purge every 10 seconds, even if the server
-	were active */
-
-	n_pages_purged = 1;
-
-	last_flush_time = time(NULL);
-
-	while (n_pages_purged) {
-
-		if (srv_fast_shutdown && srv_shutdown_state > 0) {
-
-			goto background_loop;
-		}
-
-		srv_main_thread_op_info = "purging";
-		n_pages_purged = trx_purge();
-
-		current_time = time(NULL);
-
-		if (difftime(current_time, last_flush_time) > 1) {
-			srv_main_thread_op_info = "flushing log";
-
-			log_buffer_flush_to_disk();
-			last_flush_time = current_time;
-			srv_sync_flush++;
-		}
-	}
-
 	srv_main_thread_op_info = "flushing buffer pool pages";
 
 	/* Flush a few oldest pages to make a new checkpoint younger */
@@ -2609,6 +2673,7 @@
 		(> 70 %), we assume we can afford reserving the disk(s) for
 		the time it requires to flush 100 pages */
 
+		srv_buf_flush_dirtypct_calls++;
 		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
 						  PCT_IO(100),
 						  ut_dulint_max);
@@ -2617,6 +2682,7 @@
 		we do not unnecessarily use much disk i/o capacity from
 		other work */
 
+		srv_buf_flush_master_calls++;
 		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
 						  PCT_IO(10),
 						  ut_dulint_max);
@@ -2666,34 +2732,6 @@
 		os_thread_sleep(100000);
 	}
 
-	srv_main_thread_op_info = "purging";
-
-	/* Run a full purge */
-
-	n_pages_purged = 1;
-
-	last_flush_time = time(NULL);
-
-	while (n_pages_purged) {
-		if (srv_fast_shutdown && srv_shutdown_state > 0) {
-
-			break;
-		}
-
-		srv_main_thread_op_info = "purging";
-		n_pages_purged = trx_purge();
-
-		current_time = time(NULL);
-
-		if (difftime(current_time, last_flush_time) > 1) {
-			srv_main_thread_op_info = "flushing log";
-
-			log_buffer_flush_to_disk();
-			last_flush_time = current_time;
-			srv_sync_flush++;
-		}
-	}
-
 	srv_main_thread_op_info = "reserving kernel mutex";
 
 	mutex_enter(&kernel_mutex);
@@ -2728,6 +2766,7 @@
 	srv_main_thread_op_info = "flushing buffer pool pages";
 	srv_main_flush_loops++;
 	if (srv_fast_shutdown < 2) {
+		srv_buf_flush_master_calls++;
 		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
 						  PCT_IO(100),
 						  ut_dulint_max);
@@ -2855,4 +2894,93 @@
 
 	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
 }
+
+
+/*************************************************************************
+The purge was originally part of srv_master_thread. It was observed with
+high delete/update workloads that srv_master_thread would end up looping
+in the purge, not performing any of its other background tasks, including
+flushing the buffer pool. By moving this function to its own thread the
+full purge can occur without impacting other background tasks.
+Based on idea of Tim Cook & Vince Carbone */
+
+os_thread_ret_t
+srv_purge_thread(
+/*==============*/
+/* out: a dummy parameter */
+void*	arg __attribute__((unused)))
+	/* in: a dummy parameter required by os_thread_create */
+{
+	ulint	n_pages_purged;
+	ulint	exec;
+	ulint	sleep10s= 1000; /* sleep power between 10ms and 10s (1 & 1000) */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Purge thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+	srv_purge_thread_process_no = os_proc_get_number();
+	srv_purge_thread_id = os_thread_pf(os_thread_get_curr_id());
+
+	srv_table_reserve_slot(SRV_PURGE);
+	mutex_enter(&kernel_mutex);
+	srv_n_threads_active[SRV_PURGE]++;
+	mutex_exit(&kernel_mutex);
+	exec= srv_purge_exec;
+
+loop:
+	/* we run a full purge every 10ms - 10sec, even if the server
+	was inactive (the timeout is auto-adaptive) */
+
+	if( exec == srv_purge_exec ) {
+
+		/* increase timeout if there was no purge execution */
+		if( sleep10s < 1000 ) sleep10s *= 2;
+		if( sleep10s > 1000 ) sleep10s= 1000;
+
+	}
+	else {
+
+		/* leave only 10ms sleeping if a purge burst woke up instantly */
+		if( srv_purge_exec - exec > 100000 && sleep10s == 1000 )
+			sleep10s= 20;
+
+		/* decrease timeout if purge is active */
+		if( sleep10s > 1 ) sleep10s /= 2;
+		if( sleep10s < 1 ) sleep10s= 1;
+
+	}
+
+	srv_purge_sleeps++;
+	os_thread_sleep( sleep10s * 10000 );
+	exec= srv_purge_exec;
+
+	do {
+		if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+			goto exit_func;
+		}
+
+		if( (n_pages_purged = trx_purge()) != 0 ) {
+			srv_purge_exec++;
+			srv_purge_pages += n_pages_purged;
+		}
+
+		srv_purge_calls++;
+
+	} while (n_pages_purged);
+
+	goto loop;
+
+exit_func:
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+
+}
+
 #endif /* !UNIV_HOTBACKUP */
diff -Naru innobase-5.4.1-orig/srv/srv0start.c innobase-5.4.1-dim/srv/srv0start.c
--- innobase-5.4.1-orig/srv/srv0start.c	2009-06-24 01:31:18.000000000 +0200
+++ innobase-5.4.1-dim/srv/srv0start.c	2009-08-06 20:21:20.000000000 +0200
@@ -1654,6 +1654,11 @@
 
 	os_thread_create(&srv_master_thread, NULL, thread_ids
 			 + (1 + SRV_MAX_N_IO_THREADS));
+
+	/* Create the purge thread which does purge of undo records */
+	os_thread_create(&srv_purge_thread, NULL, thread_ids
+			 + (4 + SRV_MAX_N_IO_THREADS));
+
 #ifdef UNIV_DEBUG
 	/* buf_debug_prints = TRUE; */
 #endif /* UNIV_DEBUG */
diff -Naru innobase-5.4.1-orig/trx/trx0purge.c innobase-5.4.1-dim/trx/trx0purge.c
--- innobase-5.4.1-orig/trx/trx0purge.c	2009-06-24 01:31:18.000000000 +0200
+++ innobase-5.4.1-dim/trx/trx0purge.c	2009-08-15 20:42:05.000000000 +0200
@@ -1058,8 +1058,17 @@
 	/* If we cannot advance the 'purge view' because of an old
 	'consistent read view', then the DML statements cannot be delayed.
 	Also, srv_max_purge_lag <= 0 means 'infinity'. */
-	if (srv_max_purge_lag > 0
-	    && !UT_LIST_GET_LAST(trx_sys->view_list)) {
+
+/* ** Commented: the only way currently to decrease a growing purge lag is
+// ** to slow down DML queries, whatever they have or don't have
+// ** consistent reading...
+// ** -Dimitri
+//
+//	if (srv_max_purge_lag > 0
+//	    && !UT_LIST_GET_LAST(trx_sys->view_list)) {
+*/
+
+	if (srv_max_purge_lag > 0 ) {
 		float	ratio = (float) trx_sys->rseg_history_len
 			/ srv_max_purge_lag;
 		if (ratio > ULINT_MAX / 10000) {
@@ -1072,6 +1081,11 @@
 			by at least 5000 microseconds. */
 			srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000);
 		}
+
+		/* Limit the max delay to 50ms TODO: should be a conf parameter? */
+		if( srv_dml_needed_delay > 50000 ) srv_dml_needed_delay= 50000;
+		if( srv_dml_needed_delay > srv_dml_needed_delay_max)
+			srv_dml_needed_delay_max= srv_dml_needed_delay;
 	}
 
 	purge_sys->view = read_view_oldest_copy_or_open_new(ut_dulint_zero,