ReTAG is a graph-based system for global sensemaking that synthesizes information and relationships across large-scale document collections. It leverages topic-augmented summarization and retrieval techniques to generate precise, context-rich answers to complex queries.
- CUDA 11.8
- Python 3.8.10
To install required libraries, run:
pip install -r requirements.txt

Please use the following files as your corpus:
- data/corpus/corpus_news_article.json
- data/corpus/corpus_podcast.json
Please use the following files as your query:
- data/query/news_article_total_questions.json
- data/query/podcast_total_questions.json
python data/process_corpus.py \
--base_dir "YOUR_EXPERIMENT_DIRECTORY" \
--input_path "data/corpus/corpus_podcast.json" \
--output_path "retag/data/chunks/podcast_chunks.json"

(1) Entity/Relation Extraction
./baseline_entity_relation_graph.sh

(2) Graph Construction
python entity_relation_graph/make_graph.py \
--base_dir "YOUR_EXPERIMENT_DIRECTORY" \
--input_path_reflect "output/entity_relation_graph/baseline/podcast/reflect.json" \
--input_path_gleaning "output/entity_relation_graph/baseline/podcast/gleaning.json" \
--save_path "output/entity_relation_graph/baseline/podcast/graph.pkl"
# Note: --input_path_gleaning is the gleaning path; --save_path is a pickle file.

(1) Hierarchical Graph Clustering
python ./community_summarization/make_graph_hierarchy.py \
--base_dir "YOUR_EXPERIMENT_DIRECTORY" \
--graph_path "output/entity_relation_graph/baseline/podcast/graph.pkl" \
--hierarchical_graph_save_path "output/community_summarization/baseline/podcast/hierarchy.json" \
--children_dict_save_path "output/community_summarization/baseline/podcast/children_dict.json" \
--final_dict_save_path "output/community_summarization/baseline/podcast/final_dict.json"

(2) Community Summarization
./baseline_community_summarization.sh

./baseline_response_generation.sh

./retag_topic_mining.sh  # topic extraction
python topic_mining/make_topic_total.py \
--base_dir "YOUR_EXPERIMENT_DIRECTORY" \
--input_path_prefix "output/topic_mining/response/podcast/topic" \
--max_level 5 \
--save_path "output/topic_mining/total_topic/podcast.json"
./retag_dataset_desc.sh

./retag_entity_relation_graph.sh
python entity_relation_graph/make_graph_retag.py \
--base_dir "YOUR_EXPERIMENT_DIRECTORY" \
--input_path_reflect "output/entity_relation_graph/retag/podcast/reflect.json" \
--input_path_gleaning "output/entity_relation_graph/retag/podcast/gleaning.json" \
--save_path_prefix "output/entity_relation_graph/retag/podcast/graph/graph"
# Note: --input_path_gleaning is the gleaning path; --save_path_prefix is the pickle file prefix.

python community_summarization/make_graph_hierarchy_retag.py \
--base_dir "YOUR_EXPERIMENT_DIRECTORY" \
--topic_data_path "output/topic_mining/total_topic/podcast.json" \
--graph_path_prefix "output/entity_relation_graph/retag/podcast/graph/graph" \
--hierarchical_graph_save_path_prefix "output/community_summarization/retag/podcast/hierarchy/hierarchy" \
--children_dict_save_path_prefix "output/community_summarization/retag/podcast/children_dict/children" \
--final_dict_save_path_prefix "output/community_summarization/retag/podcast/final_dict/final"
./retag_community_summarization.sh

# Topic classification
python response_generation/topic_select_retag.py \
--model_id "Infermatic/Llama-3.3-70B-Instruct-FP8-Dynamic" \
--download_dir "/home/work/graph_construction/llama_3.3_70b_instruct_fp8" \
--num_gpus 2 \
--batch_size 256 \
--base_dir "path/to/your/exp/directory" \
--topic_data_path "output/topic_mining/total_topic/podcast.json" \
--question_data_path "data/query/podcast_total_questions.json" \
--dataset_desc_path "output/topic_mining/response/podcast/desc_0.json" \
--save_prefix "output/response_generation/retag/topic_select/podcast"
# Query keyword expansion
python response_generation/keyword_expansion_retag.py \
--model_id "Infermatic/Llama-3.3-70B-Instruct-FP8-Dynamic" \
--download_dir "/home/work/graph_construction/llama_3.3_70b_instruct_fp8" \
--num_gpus 2 \
--batch_size 256 \
--base_dir "path/to/your/exp/directory" \
--question_data_path "data/query/podcast_total_questions.json" \
--dataset_description_path "output/topic_mining/response/podcast/desc_0.json" \
--save_path_prefix "output/response_generation/retag/podcast/keywords"
# Retrieve community reports (the max level may differ for your data; check it in your graph hierarchy)
python response_generation/make_local_reports_retag.py \
--base_dir "path/to/your/exp/directory" \
--topic_data_path "output/topic_mining/total_topic/podcast.json" \
--community_report_prefix "output/community_summarization/retag/podcast/community_summary/community_summary" \
--hierarchy_prefix "output/community_summarization/retag/podcast/hierarchy/hierarchy" \
--final_dict_path_prefix "output/community_summarization/retag/podcast/final_dict/final" \
--local_reports_prefix "output/community_summarization/retag/podcast/local_reports/local_reports" \
--max_level 5
python response_generation/make_local_reports_baseline.py \
--base_dir "path/to/your/exp/directory" \
--community_report_prefix "output/community_summarization/baseline/podcast/community_summary" \
--hierarchy_path "output/community_summarization/baseline/podcast/hierarchy.json" \
--final_dict_path "output/community_summarization/baseline/podcast/final_dict.json" \
--local_reports_prefix "output/community_summarization/retag/podcast/local_reports/local_reports" \
--max_level 5
python response_generation/retrieval_retag.py \
--base_dir "path/to/your/exp/directory" \
--topic_data_path "output/response_generation/retag/topic_select/podcast.json" \
--keywords_data_path "output/response_generation/retag/podcast/keywords.json" \
--local_reports_prefix "output/community_summarization/retag/podcast/local_reports/local_reports" \
--hierarchy_prefix "output/community_summarization/retag/podcast/hierarchy/hierarchy" \
--hierarchy_backup "output/community_summarization/baseline/podcast/hierarchy.json" \
--save_prefix "output/community_summarization/retag/podcast/retrieved_community_summary/community_summary" \
--max_level 5
# response generation
./retag_response_generation.sh

python evaluation/eval_gpt.py \
--model "gpt-4o-mini" \
--api_key "YOUR_API_KEY" \
--batch_size 64 \
--base_dir "path/to/your/exp/directory" \
--data1_prefix "output/response_generation/retag/podcast/final_response" \
--data1_max_level 5 \
--data2_prefix "output/response_generation/baseline/podcast/final_response" \
--data2_max_level 5 \
--question_data_path "data/query/podcast_total_questions.json" \
--response_prefix "output/evaluation/podcast"
# The win-rate results are printed to the terminal
python evaluation/eval_parsing.py \
--base_dir "path/to/your/exp/directory" \
--response_path "output/evaluation/podcast.json"
