log clean up

This commit is contained in:
Michael Yang 2024-05-09 13:52:56 -07:00
parent dc18eee39d
commit 58876091f7
3 changed files with 30 additions and 34 deletions

View file

@@ -66,7 +66,7 @@ struct server_params {
}; };
bool server_verbose = false; bool server_verbose = false;
bool server_log_json = true; bool server_log_json = false;
enum stop_type { enum stop_type {
STOP_FULL, STOP_FULL,
@@ -266,7 +266,7 @@ struct server_slot {
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
t_prompt_processing, n_prompt_tokens_processed, t_prompt_processing, n_prompt_tokens_processed,
t_token, n_tokens_second); t_token, n_tokens_second);
LOG_INFO(buffer, { LOG_DEBUG(buffer, {
{"slot_id", id}, {"slot_id", id},
{"task_id", task_id}, {"task_id", task_id},
{"t_prompt_processing", t_prompt_processing}, {"t_prompt_processing", t_prompt_processing},
@@ -280,7 +280,7 @@ struct server_slot {
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
t_token_generation, n_decoded, t_token_generation, n_decoded,
t_token, n_tokens_second); t_token, n_tokens_second);
LOG_INFO(buffer, { LOG_DEBUG(buffer, {
{"slot_id", id}, {"slot_id", id},
{"task_id", task_id}, {"task_id", task_id},
{"t_token_generation", t_token_generation}, {"t_token_generation", t_token_generation},
@@ -290,7 +290,7 @@ struct server_slot {
}); });
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
LOG_INFO(buffer, { LOG_DEBUG(buffer, {
{"slot_id", id}, {"slot_id", id},
{"task_id", task_id}, {"task_id", task_id},
{"t_prompt_processing", t_prompt_processing}, {"t_prompt_processing", t_prompt_processing},
@@ -371,7 +371,7 @@ struct llama_server_context
{ {
if (clp_ctx) if (clp_ctx)
{ {
LOG_INFO("freeing clip model", {}); LOG_DEBUG("freeing clip model", {});
clip_free(clp_ctx); clip_free(clp_ctx);
clp_ctx = nullptr; clp_ctx = nullptr;
} }
@@ -392,7 +392,7 @@ struct llama_server_context
params = params_; params = params_;
if (!params.mmproj.empty()) { if (!params.mmproj.empty()) {
multimodal = true; multimodal = true;
LOG_INFO("Multi Modal Mode Enabled", {}); LOG_DEBUG("Multi Modal Mode Enabled", {});
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
if(clp_ctx == nullptr) { if(clp_ctx == nullptr) {
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}}); LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -445,7 +445,7 @@ struct llama_server_context
const int32_t n_ctx_slot = n_ctx / params.n_parallel; const int32_t n_ctx_slot = n_ctx / params.n_parallel;
LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); LOG_DEBUG("initializing slots", {{"n_slots", params.n_parallel}});
for (int i = 0; i < params.n_parallel; i++) for (int i = 0; i < params.n_parallel; i++)
{ {
server_slot slot; server_slot slot;
@@ -454,7 +454,7 @@ struct llama_server_context
slot.n_ctx = n_ctx_slot; slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict; slot.n_predict = params.n_predict;
LOG_INFO("new slot", { LOG_DEBUG("new slot", {
{"slot_id", slot.id}, {"slot_id", slot.id},
{"n_ctx_slot", slot.n_ctx} {"n_ctx_slot", slot.n_ctx}
}); });
@@ -468,7 +468,7 @@ struct llama_server_context
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
LOG_INFO("slot self-extend", { LOG_DEBUG("slot self-extend", {
{"slot_id", slot.id}, {"slot_id", slot.id},
{"ga_n", ga_n}, {"ga_n", ga_n},
{"ga_w", ga_w} {"ga_w", ga_w}
@@ -827,7 +827,7 @@ struct llama_server_context
all_slots_are_idle = false; all_slots_are_idle = false;
LOG_INFO("slot is processing task", { LOG_DEBUG("slot is processing task", {
{"slot_id", slot->id}, {"slot_id", slot->id},
{"task_id", slot->task_id}, {"task_id", slot->task_id},
}); });
@@ -1504,7 +1504,7 @@ struct llama_server_context
} }
slots_data.push_back(slot_data); slots_data.push_back(slot_data);
} }
LOG_INFO("slot data", { LOG_DEBUG("slot data", {
{"task_id", task.id}, {"task_id", task.id},
{"n_idle_slots", n_idle_slots}, {"n_idle_slots", n_idle_slots},
{"n_processing_slots", n_processing_slots} {"n_processing_slots", n_processing_slots}
@@ -1566,7 +1566,7 @@ struct llama_server_context
bool update_slots() { bool update_slots() {
if (system_need_update) if (system_need_update)
{ {
LOG_INFO("updating system prompt", {}); LOG_DEBUG("updating system prompt", {});
system_prompt_update(); system_prompt_update();
} }
@@ -1576,7 +1576,7 @@ struct llama_server_context
{ {
if (system_prompt.empty() && clean_kv_cache) if (system_prompt.empty() && clean_kv_cache)
{ {
LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {}); LOG_DEBUG("all slots are idle and system prompt is empty, clear the KV cache", {});
kv_cache_clear(); kv_cache_clear();
} }
return true; return true;
@@ -1599,7 +1599,7 @@ struct llama_server_context
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
const int n_discard = n_left / 2; const int n_discard = n_left / 2;
LOG_INFO("slot context shift", { LOG_DEBUG("slot context shift", {
{"slot_id", slot.id}, {"slot_id", slot.id},
{"task_id", slot.task_id}, {"task_id", slot.task_id},
{"n_keep", n_keep}, {"n_keep", n_keep},
@@ -1638,7 +1638,7 @@ struct llama_server_context
slot.command = NONE; slot.command = NONE;
slot.t_last_used = ggml_time_us(); slot.t_last_used = ggml_time_us();
LOG_INFO("slot released", { LOG_DEBUG("slot released", {
{"slot_id", slot.id}, {"slot_id", slot.id},
{"task_id", slot.task_id}, {"task_id", slot.task_id},
{"n_ctx", n_ctx}, {"n_ctx", n_ctx},
@@ -1807,7 +1807,7 @@ struct llama_server_context
slot.ga_i = ga_i; slot.ga_i = ga_i;
} }
LOG_INFO("slot progression", { LOG_DEBUG("slot progression", {
{ "slot_id", slot.id }, { "slot_id", slot.id },
{ "task_id", slot.task_id }, { "task_id", slot.task_id },
{ "n_past", slot.n_past }, { "n_past", slot.n_past },
@@ -1822,7 +1822,7 @@ struct llama_server_context
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
{ {
// we have to evaluate at least 1 token to generate logits. // we have to evaluate at least 1 token to generate logits.
LOG_INFO("we have to evaluate at least 1 token to generate logits", { LOG_DEBUG("we have to evaluate at least 1 token to generate logits", {
{ "slot_id", slot.id }, { "slot_id", slot.id },
{ "task_id", slot.task_id } { "task_id", slot.task_id }
}); });
@@ -1834,7 +1834,7 @@ struct llama_server_context
} }
int p0 = (int) system_tokens.size() + slot.n_past; int p0 = (int) system_tokens.size() + slot.n_past;
LOG_INFO("kv cache rm [p0, end)", { LOG_DEBUG("kv cache rm [p0, end)", {
{ "slot_id", slot.id }, { "slot_id", slot.id },
{ "task_id", slot.task_id }, { "task_id", slot.task_id },
{ "p0", p0 } { "p0", p0 }
@@ -2491,11 +2491,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
else if (arg == "-v" || arg == "--verbose") else if (arg == "-v" || arg == "--verbose")
{ {
#if SERVER_VERBOSE != 1
LOG_WARNING("server.cpp is not built with verbose logging.", {});
#else
server_verbose = true; server_verbose = true;
#endif
} }
else if (arg == "--mlock") else if (arg == "--mlock")
{ {
@@ -2601,7 +2597,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
else if (arg == "--log-disable") else if (arg == "--log-disable")
{ {
log_set_target(stdout); log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {}); LOG_DEBUG("logging to file is disabled.", {});
} }
else if (arg == "--slots-endpoint-disable") else if (arg == "--slots-endpoint-disable")
{ {
@@ -2732,7 +2728,7 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo
return; return;
} }
LOG_INFO("request", { LOG_DEBUG("request", {
{"remote_addr", req.remote_addr}, {"remote_addr", req.remote_addr},
{"remote_port", req.remote_port}, {"remote_port", req.remote_port},
{"status", res.status}, {"status", res.status},

View file

@@ -55,9 +55,10 @@ extern bool server_log_json;
} while (0) } while (0)
#endif #endif
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_DEBUG( MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)
enum server_state { enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
@@ -123,6 +124,10 @@ static inline void server_log(const char *level, const char *function, int line,
{"timestamp", time(nullptr)}, {"timestamp", time(nullptr)},
}; };
if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
return;
}
if (server_log_json) { if (server_log_json) {
log.merge_patch( log.merge_patch(
{ {
@@ -137,14 +142,12 @@ static inline void server_log(const char *level, const char *function, int line,
std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
} else { } else {
char buf[1024];
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
if (!extra.empty()) { if (!extra.empty()) {
log.merge_patch(extra); log.merge_patch(extra);
} }
std::stringstream ss; std::stringstream ss;
ss << buf << " |"; ss << level << " [" << function << "] " << message << " |";
for (const auto& el : log.items()) for (const auto& el : log.items())
{ {
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);

View file

@@ -157,11 +157,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
"--batch-size", fmt.Sprintf("%d", opts.NumBatch), "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--embedding", "--embedding",
} }
if envconfig.Debug {
params = append(params, "--log-format", "json") params = append(params, "--log-disable")
} else {
params = append(params, "--log-disable")
}
if opts.NumGPU >= 0 { if opts.NumGPU >= 0 {
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU)) params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))