# benchmark_inference_ram_and_flops_usage.py
import os
import time
from typing import Callable

import psutil


def benchmark_inference(inference_func: Callable, *args, **kwargs) -> dict:
    """
    Benchmark an inference task, measuring wall-clock time, RAM usage, and a
    rough FLOP estimate.

    Args:
        inference_func (Callable): The inference function to benchmark.
        *args: Positional arguments to pass to the inference function.
        **kwargs: Keyword arguments to pass to the inference function.

    Returns:
        dict: A dictionary containing the benchmark results.
    """
    # Get a handle on the current Python process
    process = psutil.Process(os.getpid())

    # Measure the start time and initial resident memory
    start_time = time.perf_counter()
    initial_memory = process.memory_info().rss

    # Run the inference function
    result = inference_func(*args, **kwargs)

    # Measure the end time and final resident memory
    end_time = time.perf_counter()
    final_memory = process.memory_info().rss

    # Calculate the execution time and net memory growth. The RSS delta only
    # captures net growth of the whole process; it misses transient peaks and
    # can even be negative if the allocator returns pages to the OS.
    execution_time = end_time - start_time
    memory_used = final_memory - initial_memory

    # Average resident memory over the run (a two-point approximation)
    avg_memory = (initial_memory + final_memory) / 2

    # Estimate the number of floating-point operations. This is NOT a measured
    # count: it simply multiplies wall-clock time by an assumed throughput, so
    # it is only useful for order-of-magnitude comparisons on one machine.
    # Adjust FLOPS_PER_SECOND to your hardware's sustained throughput, or use a
    # profiler if you need a real count.
    FLOPS_PER_SECOND = 1e9  # Assuming 1 GFLOP/s sustained (adjust for your system)
    estimated_flops = execution_time * FLOPS_PER_SECOND

    # Collect the benchmark results
    benchmark_results = {
        "execution_time": execution_time,
        "memory_used": memory_used,
        "avg_memory": avg_memory,
        "estimated_flops": estimated_flops,
        "result": result,
    }
    return benchmark_results
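

# The RSS delta above only captures net growth of the whole process and misses
# transient peaks. A minimal sketch of peak tracking with the standard-library
# tracemalloc module; note that tracemalloc only sees Python-level allocations,
# not native buffers inside llama.cpp or PyTorch:
def benchmark_peak_python_memory(inference_func: Callable, *args, **kwargs) -> dict:
    import tracemalloc

    tracemalloc.start()
    start = time.perf_counter()
    result = inference_func(*args, **kwargs)
    elapsed = time.perf_counter() - start
    _, peak = tracemalloc.get_traced_memory()  # (current, peak) in bytes
    tracemalloc.stop()
    return {"execution_time": elapsed, "peak_python_memory": peak, "result": result}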


# Example usage with llama.cpp (via the llama-cpp-python bindings)
def llama_cpp_inference(prompt: str, model_path: str, n_threads: int = 4, n_predict: int = 128, temp: float = 0.8, top_k: int = 40, top_p: float = 0.9) -> str:
    from llama_cpp import Llama

    # Load the model; llama-cpp-python takes the path as a keyword argument
    model = Llama(model_path=model_path, n_threads=n_threads)

    # Run the inference; sampling parameters are passed per call
    output = model(prompt, max_tokens=n_predict, temperature=temp, top_k=top_k, top_p=top_p)

    # Extract and clean up the generated text from the completion dict
    result = output["choices"][0]["text"].strip()
    return result
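
# As with the Transformers example below, loading the weights dominates the
# measured time here; to benchmark generation alone, construct the Llama
# instance once outside the timed call. Passing verbose=False to Llama() also
# keeps load-time logging out of the benchmark output.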


# Example usage with Hugging Face Transformers
def huggingface_inference(prompt: str, model_name: str = "facebook/opt-1.3b", max_length: int = 128, num_beams: int = 4, no_repeat_ngram_size: int = 2, early_stopping: bool = True) -> str:
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Tokenize the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Run the inference with beam search
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=early_stopping,
    )

    # Decode the generated output
    result = tokenizer.decode(output[0], skip_special_tokens=True)
    return result
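

# Note: huggingface_inference reloads the tokenizer and model on every call, so
# benchmarking it mostly measures model loading. A sketch (assuming the same
# transformers API as above) that loads once and benchmarks only generation:
def make_huggingface_generator(model_name: str = "facebook/opt-1.3b") -> Callable:
    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    def generate(prompt: str, max_length: int = 128) -> str:
        # Only tokenization, generation, and decoding land inside the timed call
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        output = model.generate(input_ids, max_length=max_length)
        return tokenizer.decode(output[0], skip_special_tokens=True)

    return generate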


# Example usage of the benchmark_inference function
if __name__ == "__main__":
    prompt = "The meaning of life is"

    llama_model_path = "/path/to/llama/model"
    llama_cpp_benchmark = benchmark_inference(llama_cpp_inference, prompt, model_path=llama_model_path, n_threads=4, n_predict=128, temp=0.8, top_k=40, top_p=0.9)
    print("llama_cpp benchmark results:")
    print(llama_cpp_benchmark)

    huggingface_model_name = "facebook/opt-1.3b"
    huggingface_benchmark = benchmark_inference(huggingface_inference, prompt, model_name=huggingface_model_name, max_length=128, num_beams=4, no_repeat_ngram_size=2, early_stopping=True)
    print("Hugging Face Transformers benchmark results:")
    print(huggingface_benchmark)
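
    # A cheap self-test of the harness itself, using a pure-Python workload so
    # the script can be exercised without downloading any model weights
    # (dummy_inference below is illustrative, not part of the original API):
    def dummy_inference(n: int = 1_000_000) -> float:
        return sum(i * 0.5 for i in range(n))

    print("dummy workload benchmark results:")
    print(benchmark_inference(dummy_inference, 2_000_000))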