diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 41455c65..0c339989 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -66,7 +66,7 @@ struct server_params {
 };
 
 bool server_verbose = false;
-bool server_log_json = true;
+bool server_log_json = false;
 
 enum stop_type {
     STOP_FULL,
@@ -266,7 +266,7 @@ struct server_slot {
         sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                 t_prompt_processing, n_prompt_tokens_processed,
                 t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
            {"slot_id", id},
            {"task_id", task_id},
            {"t_prompt_processing", t_prompt_processing},
@@ -280,7 +280,7 @@
         sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
                 t_token_generation, n_decoded,
                 t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
            {"slot_id", id},
            {"task_id", task_id},
            {"t_token_generation", t_token_generation},
@@ -290,7 +290,7 @@
        });
 
        sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
            {"slot_id", id},
            {"task_id", task_id},
            {"t_prompt_processing", t_prompt_processing},
@@ -371,7 +371,7 @@ struct llama_server_context
     {
         if (clp_ctx)
         {
-            LOG_INFO("freeing clip model", {});
+            LOG_DEBUG("freeing clip model", {});
             clip_free(clp_ctx);
             clp_ctx = nullptr;
         }
@@ -392,7 +392,7 @@ struct llama_server_context
         params = params_;
         if (!params.mmproj.empty()) {
             multimodal = true;
-            LOG_INFO("Multi Modal Mode Enabled", {});
+            LOG_DEBUG("Multi Modal Mode Enabled", {});
             clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
             if(clp_ctx == nullptr) {
                 LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -445,7 +445,7 @@ struct llama_server_context
 
         const int32_t n_ctx_slot = n_ctx / params.n_parallel;
 
-        LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
+        LOG_DEBUG("initializing slots", {{"n_slots", params.n_parallel}});
         for (int i = 0; i < params.n_parallel; i++)
         {
             server_slot slot;
@@ -454,7 +454,7 @@ struct llama_server_context
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params.n_predict;
 
-            LOG_INFO("new slot", {
+            LOG_DEBUG("new slot", {
                 {"slot_id", slot.id},
                 {"n_ctx_slot", slot.n_ctx}
             });
@@ -468,7 +468,7 @@ struct llama_server_context
                 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
                 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-                LOG_INFO("slot self-extend", {
+                LOG_DEBUG("slot self-extend", {
                     {"slot_id", slot.id},
                     {"ga_n", ga_n},
                     {"ga_w", ga_w}
                 });
@@ -827,7 +827,7 @@ struct llama_server_context
 
         all_slots_are_idle = false;
 
-        LOG_INFO("slot is processing task", {
+        LOG_DEBUG("slot is processing task", {
            {"slot_id", slot->id},
            {"task_id", slot->task_id},
        });
@@ -1504,7 +1504,7 @@ struct llama_server_context
                }
                slots_data.push_back(slot_data);
            }
-            LOG_INFO("slot data", {
+            LOG_DEBUG("slot data", {
                {"task_id", task.id},
                {"n_idle_slots", n_idle_slots},
                {"n_processing_slots", n_processing_slots}
            });
@@ -1566,7 +1566,7 @@ struct llama_server_context
 
     bool update_slots() {
         if (system_need_update) {
-            LOG_INFO("updating system prompt", {});
+            LOG_DEBUG("updating system prompt", {});
             system_prompt_update();
         }
@@ -1576,7 +1576,7 @@ struct llama_server_context
         {
             if (system_prompt.empty() && clean_kv_cache) {
-                LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
+                LOG_DEBUG("all slots are idle and system prompt is empty, clear the KV cache", {});
                 kv_cache_clear();
             }
             return true;
         }
@@ -1599,7 +1599,7 @@ struct llama_server_context
                const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
                const int n_discard = n_left / 2;
 
-                LOG_INFO("slot context shift", {
+                LOG_DEBUG("slot context shift", {
                    {"slot_id", slot.id},
                    {"task_id", slot.task_id},
                    {"n_keep", n_keep},
@@ -1638,7 +1638,7 @@ struct llama_server_context
                slot.command = NONE;
                slot.t_last_used = ggml_time_us();
 
-                LOG_INFO("slot released", {
+                LOG_DEBUG("slot released", {
                    {"slot_id", slot.id},
                    {"task_id", slot.task_id},
                    {"n_ctx", n_ctx},
@@ -1807,7 +1807,7 @@ struct llama_server_context
                    slot.ga_i = ga_i;
                }
 
-                LOG_INFO("slot progression", {
+                LOG_DEBUG("slot progression", {
                    { "slot_id", slot.id },
                    { "task_id", slot.task_id },
                    { "n_past", slot.n_past },
@@ -1822,7 +1822,7 @@ struct llama_server_context
                if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
                {
                    // we have to evaluate at least 1 token to generate logits.
-                    LOG_INFO("we have to evaluate at least 1 token to generate logits", {
+                    LOG_DEBUG("we have to evaluate at least 1 token to generate logits", {
                        { "slot_id", slot.id },
                        { "task_id", slot.task_id }
                    });
@@ -1834,7 +1834,7 @@ struct llama_server_context
                }
                int p0 = (int) system_tokens.size() + slot.n_past;
-                LOG_INFO("kv cache rm [p0, end)", {
+                LOG_DEBUG("kv cache rm [p0, end)", {
                    { "slot_id", slot.id },
                    { "task_id", slot.task_id },
                    { "p0", p0 }
                });
@@ -2491,11 +2491,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
     }
     else if (arg == "-v" || arg == "--verbose")
     {
-#if SERVER_VERBOSE != 1
-        LOG_WARNING("server.cpp is not built with verbose logging.", {});
-#else
         server_verbose = true;
-#endif
     }
     else if (arg == "--mlock")
     {
@@ -2601,7 +2597,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
     else if (arg == "--log-disable")
     {
         log_set_target(stdout);
-        LOG_INFO("logging to file is disabled.", {});
+        LOG_DEBUG("logging to file is disabled.", {});
     }
     else if (arg == "--slots-endpoint-disable")
     {
@@ -2732,7 +2728,7 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo
         return;
     }
 
-    LOG_INFO("request", {
+    LOG_DEBUG("request", {
        {"remote_addr", req.remote_addr},
        {"remote_port", req.remote_port},
        {"status", res.status},
diff --git a/llm/ext_server/utils.hpp b/llm/ext_server/utils.hpp
index bd340656..d63ead04 100644
--- a/llm/ext_server/utils.hpp
+++ b/llm/ext_server/utils.hpp
@@ -55,9 +55,10 @@ extern bool server_log_json;
     } while (0)
 #endif
 
-#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_DEBUG( MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)
 
 enum server_state {
     SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
@@ -123,6 +124,10 @@ static inline void server_log(const char *level, const char *function, int line,
         {"timestamp", time(nullptr)},
     };
 
+    if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
+        return;
+    }
+
     if (server_log_json) {
         log.merge_patch(
             {
@@ -137,14 +142,12 @@ static inline void server_log(const char *level, const char *function, int line,
 
         std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
     } else {
-        char buf[1024];
-        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
-
         if (!extra.empty()) {
             log.merge_patch(extra);
         }
+
         std::stringstream ss;
-        ss << buf << " |";
+        ss << level << " [" << function << "] " << message << " |";
         for (const auto& el : log.items())
         {
             const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
diff --git a/llm/server.go b/llm/server.go
index 9fe4cc17..27f471a0 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -157,11 +157,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
        "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
        "--embedding",
    }
-   if envconfig.Debug {
-       params = append(params, "--log-format", "json")
-   } else {
-       params = append(params, "--log-disable")
-   }
+
+   params = append(params, "--log-disable")
 
    if opts.NumGPU >= 0 {
        params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))