mirror of
https://github.com/apache/impala.git
synced 2025-12-30 03:01:44 -05:00
Change-Id: I5bfe77f9a871018e7a67553ed270e2df53006962 Reviewed-on: http://gerrit.cloudera.org:8080/4361 Reviewed-by: Alex Behm <alex.behm@cloudera.com> Tested-by: Internal Jenkins
106 lines
3.7 KiB
Python
Executable File
106 lines
3.7 KiB
Python
Executable File
#!/usr/bin/env impala-python
|
|
##############################################################################
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
##############################################################################
|
|
#
|
|
# This script will warm up the buffer cache with the tables required to run the input
|
|
# query. This only works on a mini-dfs cluster. This is remarkably difficult to do
|
|
# since hdfs which tries to hide the details of the block locations from users.
|
|
# The only way to do this is to
|
|
# 1. use the java APIs (deprecated, of course) to extract the block ids.
|
|
# 2. find the files with those block ids on the file system and read them
|
|
#
|
|
# First run testdata/bin/generate-block-ids.sh. This will output the block locations
|
|
# to testdata/block-ids. This file is good as long as the mini-dfs cluster does not
|
|
# get new files. If the block-ids file is not there, this script will run
|
|
# generate-block-ids.sh.
|
|
#
|
|
# Run this script, passing it the query and it will go read every replica of every
|
|
# block of every table in the query.
|
|
import math
|
|
import os
|
|
import re
|
|
import sys
|
|
import subprocess
|
|
import tempfile
|
|
from optparse import OptionParser
|
|
|
|
# Options
|
|
parser = OptionParser()
|
|
parser.add_option("-q", "--query", dest="query", default = "",
|
|
help="Query to run. If none specified, runs all queries.")
|
|
|
|
(options, args) = parser.parse_args()
|
|
|
|
block_ids_file = 'testdata/block-ids'
|
|
data_node_root = os.environ['MINI_DFS_BASE_DATA_DIR'] + '/dfs/data'
|
|
block_ids = {}
|
|
|
|
# Parse the block ids file to all the block ids for all the tables
|
|
# the format of the file is:
|
|
# <table name>: <block_id1> <block_id2> <etc>
|
|
def parse_block_ids():
|
|
full_path = os.environ['IMPALA_HOME'] + "/" + block_ids_file;
|
|
if not os.path.isfile(full_path):
|
|
cmd = os.environ['IMPALA_HOME'] + '/testdata/bin/generate-block-ids.sh'
|
|
os.system(cmd)
|
|
|
|
if not os.path.isfile(full_path):
|
|
raise Exception("Could not find/generate block id files: " + full_path)
|
|
|
|
f = open(full_path);
|
|
for line in f:
|
|
tokens = line.split(':')
|
|
blocks = tokens[1].strip().split(' ')
|
|
block_ids[tokens[0].strip()] = blocks
|
|
|
|
# Parse for the tables used in this query
|
|
def parse_tables(query):
|
|
table_predecessor = ['from', 'join']
|
|
tokens = query.split(' ')
|
|
tables = []
|
|
next_is_table = False
|
|
for t in tokens:
|
|
t = t.lower()
|
|
if next_is_table:
|
|
tables.append(t)
|
|
next_is_table = False
|
|
if t in table_predecessor:
|
|
next_is_table = True
|
|
return tables
|
|
|
|
# Warm the buffer cache by cat-ing all the blocks to /dev/null
|
|
def warm_buffer_cache(table):
|
|
if table not in block_ids:
|
|
raise Exception("Table not found: " + table)
|
|
|
|
blocks = block_ids[table]
|
|
for block in blocks:
|
|
cmd = 'find %s -type f -name blk_%s* -exec cat {} > /dev/null \;' % \
|
|
(data_node_root, block)
|
|
os.system(cmd)
|
|
|
|
tables = parse_tables(options.query)
|
|
parse_block_ids()
|
|
|
|
if len(tables) == 0:
|
|
raise Exception("Could not parse tables in: " + options.query)
|
|
|
|
for table in tables:
|
|
warm_buffer_cache(table)
|