1
0
mirror of synced 2025-12-23 21:03:15 -05:00

[source-mysql] don't do sampling for source-mysql (#55761)

Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
This commit is contained in:
Yue Li
2025-03-13 17:33:22 -07:00
committed by GitHub
parent e2ea32ec4e
commit c26d2ae755
3 changed files with 4 additions and 22 deletions

View File

@@ -201,27 +201,8 @@ class MySqlSourceOperations :
when (this) {
NoFrom -> ""
is From -> if (this.namespace == null) "FROM `$name`" else "FROM `$namespace`.`$name`"
is FromSample -> {
val from: String = From(name, namespace).sql()
// On a very big table we limit sampling to no less than a 0.005% (0.00005)
// chance of a row getting picked. This comes at the price of a bias toward the beginning
// of the table on very large tables (hundreds of millions of rows or more).
val greatestRate: String = 0.00005.toString()
// We only do a full count in case information schema contains no row count.
// This is the case for views.
val fullCount = "SELECT COUNT(*) FROM `$namespace`.`$name`"
// Quick approximation of "select count(*) from table" that doesn't require a
// full table scan. Note, however, that this summary info can be stale, so a
// new table could be treated as empty even though rows were recently added.
// To prevent that from causing the table to be skipped altogether,
// the minimum count is set to 10.
val quickCount =
"SELECT GREATEST(10, COALESCE(table_rows, ($fullCount))) FROM information_schema.tables WHERE table_schema = '$namespace' AND table_name = '$name'"
val greatest = "GREATEST($greatestRate, $sampleSize / ($quickCount))"
// Rand returns a value between 0 and 1
val where = "WHERE RAND() < $greatest "
"$from $where"
}
// just return the first sample_size of rows from the table for the best performance
is FromSample -> From(name, namespace).sql()
}
fun WhereNode.sql(): String =