Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions modules/core.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1594,8 +1594,8 @@ function generate_consolidated_report() {
# Top assets from hotlist (if present)
if [[ -s hotlist.txt ]] && command -v jq >/dev/null 2>&1; then
top_assets_json=$(head -n "${HOTLIST_TOP:-50}" hotlist.txt \
| awk '{score=$1;$1=""; sub(/^ /,"",$0); printf "{\"asset\":\"%s\",\"score\":%s}\n",$0,score}' \
| jq -s '.')
| awk '{score=$1; $1=""; sub(/^ /,"",$0); print score "\t" $0}' \
| jq -Rn '[inputs | split("\t") | {asset: .[1], score: (.[0] | tonumber? // 0)}]')
else
top_assets_json="[]"
fi
Expand All @@ -1611,10 +1611,10 @@ function generate_consolidated_report() {
| awk -F'] ' '{
ts=$1; gsub(/^\[/,"",ts);
msg=$2;
if (msg ~ /Start function:/) { print "{\"timestamp\":\"" ts "\",\"level\":\"INFO\",\"function\":\"" msg "\",\"message\":\"started\"}" }
else if (msg ~ /End function:/) { print "{\"timestamp\":\"" ts "\",\"level\":\"SUCCESS\",\"function\":\"" msg "\",\"message\":\"completed\"}" }
if (msg ~ /Start function:/) { print ts "\tINFO\t" msg "\tstarted" }
else if (msg ~ /End function:/) { print ts "\tSUCCESS\t" msg "\tcompleted" }
}' \
| jq -s '.')
| jq -Rn '[inputs | split("\t") | {timestamp: .[0], level: .[1], function: .[2], message: .[3]}]')
else
timeline_json="[]"
fi
Expand Down
13 changes: 13 additions & 0 deletions modules/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,19 @@ function sanitize_ip() {
return 0
}

# Sanitize one raw entry read from a -l list file.
# IPv4 addresses (optionally carrying a /CIDR suffix) are routed through
# sanitize_ip; every other entry is treated as a domain name.
# Prints the sanitized value on stdout; a non-zero status means "discard".
# Usage: domain=$(_sanitize_list_entry "$raw") || continue
_sanitize_list_entry() {
    local entry="$1"
    local ipv4_cidr_re='^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(/[0-9]+)?$'

    if [[ "$entry" =~ $ipv4_cidr_re ]]; then
        sanitize_ip "$entry"
        return
    fi
    sanitize_domain "$entry"
}

###############################################################################################################
####################################### SECURITY CHECKS #######################################################
###############################################################################################################
Expand Down
132 changes: 96 additions & 36 deletions modules/web.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,44 @@ _run_httpx() {
fi
}

# Drop a leading "*." wildcard label from URL-ish lines, keeping any
# http(s) scheme intact.
# Examples:
#   *.api.example.com         -> api.example.com
#   https://*.api.example.com -> https://api.example.com
_normalize_probe_urls() {
    sed -E \
        -e 's#^(https?://)\*\.#\1#' \
        -e 's#^\*\.##'
}

# Heuristic JSONL detector for probe output files.
# Succeeds (status 0) when the first non-blank line of <input_file>
# starts with "{" after optional leading whitespace; fails otherwise
# (including for missing, empty, or all-blank files).
_probe_output_is_json() {
    local probe_file="$1"
    local head_line

    # First line containing any non-whitespace character; empty string
    # when the file is unreadable, empty, or blank-only.
    head_line="$(grep -m1 '[^[:space:]]' "$probe_file" 2>/dev/null || true)"
    [[ "$head_line" =~ ^[[:space:]]*\{ ]]
}

# Pull in-scope http(s) URLs out of a probe result file and append the
# deduplicated set to <output_file>. The input may be httpx JSONL (URLs
# taken from the .url field) or an already-plain URL list.
# Usage: _extract_probe_urls <input_file> <domain_filter> <output_file>
_extract_probe_urls() {
    local src_file="$1"
    local scope="$2"
    local dest_file="$3"

    # Nothing to do for a missing or empty probe file.
    [[ ! -s "$src_file" ]] && return 0

    # Emit one candidate URL per line regardless of input format, then keep
    # only in-scope http(s) entries, strip wildcard prefixes, and dedupe.
    if _probe_output_is_json "$src_file"; then
        jq -r 'try (.url // empty)' "$src_file" 2>/dev/null
    else
        cat "$src_file" 2>/dev/null
    fi \
        | awk -v dom="$scope" 'index($0, dom) && $0 ~ /^https?:\/\// {print}' \
        | _normalize_probe_urls \
        | anew_q_safe "$dest_file"
}

# Process httpx JSON output: extract URLs and web info
# Usage: _process_httpx_output json_file url_output info_output
_process_httpx_output() {
Expand All @@ -51,7 +89,7 @@ _process_httpx_output() {
jq -r 'try .url' "$json_file" 2>/dev/null \
| grep "$domain" \
| grep -aEo 'https?://[^ ]+' \
| sed 's/*.//' \
| _normalize_probe_urls \
| anew_q_safe "$url_output"

# Extract plain web info
Expand Down Expand Up @@ -126,11 +164,12 @@ function webprobe_simple() {

# webprobe_simple is expected to write JSONL when using httpx -json.
# Some runners (or wrappers) may produce a plain URL list instead.
# Detect the format early to avoid jq parse errors and missing webs/webs.txt.
local probe_first_line probe_is_json
probe_first_line="$(awk 'NF {print; exit}' .tmp/web_full_info_probe.txt 2>/dev/null || true)"
local probe_is_json probe_input_lines urls_extracted
probe_is_json=false
[[ "$probe_first_line" =~ ^[[:space:]]*\\{ ]] && probe_is_json=true
if _probe_output_is_json ".tmp/web_full_info_probe.txt"; then
probe_is_json=true
fi
probe_input_lines=$(awk 'NF {c++} END {print c+0}' .tmp/web_full_info_probe.txt 2>/dev/null)

# Always start fresh for this run (used by urlchecks diff too).
: >.tmp/probed_tmp.txt 2>/dev/null || true
Expand All @@ -141,7 +180,7 @@ function webprobe_simple() {
if ! cat .tmp/web_full_info_probe.txt .tmp/web_full_info.txt 2>>"$LOGFILE" \
| jq -cs 'unique_by(.input)[]' 2>>"$LOGFILE" >webs/web_full_info.txt; then
log_note "webprobe_simple: failed to merge httpx JSON; falling back to probe-only" "${FUNCNAME[0]}" "${LINENO}"
awk 'match($0, /^[[:space:]]*\\{/) {print}' .tmp/web_full_info_probe.txt >.tmp/web_full_info_merge_input.jsonl 2>/dev/null || true
awk 'match($0, /^[[:space:]]*\{/) {print}' .tmp/web_full_info_probe.txt >.tmp/web_full_info_merge_input.jsonl 2>/dev/null || true
if [[ -s ".tmp/web_full_info_merge_input.jsonl" ]]; then
jq -cs 'unique_by(.input)[]' .tmp/web_full_info_merge_input.jsonl 2>>"$LOGFILE" >webs/web_full_info.txt || : >webs/web_full_info.txt
else
Expand All @@ -150,20 +189,20 @@ function webprobe_simple() {
fi
# Keep cache as JSONL for later merges.
cp webs/web_full_info.txt .tmp/web_full_info.txt 2>/dev/null || true

# Extract URLs from JSONL
if [[ -s "webs/web_full_info.txt" ]]; then
jq -r 'try (.url // empty)' webs/web_full_info.txt 2>/dev/null \
| awk -v dom="$domain" 'index($0, dom) && $0 ~ /^https?:\\/\\// {print}' \
| sed 's/*.//' | anew_q_safe .tmp/probed_tmp.txt
fi
else
log_note "webprobe_simple: probe output not JSON; treating as URL list" "${FUNCNAME[0]}" "${LINENO}"
if [[ -s ".tmp/web_full_info_probe.txt" ]]; then
awk -v dom="$domain" 'index($0, dom) && $0 ~ /^https?:\\/\\// {print}' .tmp/web_full_info_probe.txt 2>/dev/null \
| sed 's/*.//' | anew_q_safe .tmp/probed_tmp.txt
fi
fi
_extract_probe_urls ".tmp/web_full_info_probe.txt" "$domain" ".tmp/probed_tmp.txt" || true
urls_extracted=$(awk 'NF {c++} END {print c+0}' .tmp/probed_tmp.txt 2>/dev/null)

# Fallback: if extraction from probe output produced nothing, try cached JSON.
if [[ "${urls_extracted:-0}" -eq 0 ]] && [[ -s ".tmp/web_full_info.txt" ]] && _probe_output_is_json ".tmp/web_full_info.txt"; then
_extract_probe_urls ".tmp/web_full_info.txt" "$domain" ".tmp/probed_tmp.txt" || true
urls_extracted=$(awk 'NF {c++} END {print c+0}' .tmp/probed_tmp.txt 2>/dev/null)
log_note "webprobe_simple: fallback to .tmp/web_full_info.txt urls_extracted=${urls_extracted}" "${FUNCNAME[0]}" "${LINENO}"
fi

log_note "webprobe_simple: probe_input_lines=${probe_input_lines} urls_extracted=${urls_extracted:-0} probe_is_json=${probe_is_json}" "${FUNCNAME[0]}" "${LINENO}"

# Adaptive throttling heuristics: mark slow hosts (429/403) from httpx
if [[ -s "webs/web_full_info.txt" ]]; then
Expand Down Expand Up @@ -253,26 +292,47 @@ function webprobe_full() {
fi
fi

# Process web_full_info_uncommon.txt
if [[ -s ".tmp/web_full_info_uncommon.txt" ]]; then
# Extract URLs
jq -r 'try .url' .tmp/web_full_info_uncommon.txt 2>/dev/null \
| grep "$domain" \
| grep -aEo 'https?://[^ ]+' \
| sed 's/*.//' \
| anew_q_safe .tmp/probed_uncommon_ports_tmp.txt
# Process web_full_info_uncommon.txt
if [[ -s ".tmp/web_full_info_uncommon.txt" ]]; then
local uncommon_is_json uncommon_input_lines uncommon_urls_extracted
uncommon_is_json=false
if _probe_output_is_json ".tmp/web_full_info_uncommon.txt"; then
uncommon_is_json=true
fi
uncommon_input_lines=$(awk 'NF {c++} END {print c+0}' .tmp/web_full_info_uncommon.txt 2>/dev/null)

# Extract plain web info
jq -r 'try . | "\(.url) [\(.status_code)] [\(.title)] [\(.webserver)] \(.tech)"' .tmp/web_full_info_uncommon.txt \
| grep "$domain" \
| anew_q_safe webs/web_full_info_uncommon_plain.txt
: >.tmp/probed_uncommon_ports_tmp.txt 2>/dev/null || true
_extract_probe_urls ".tmp/web_full_info_uncommon.txt" "$domain" ".tmp/probed_uncommon_ports_tmp.txt" || true

# Update webs_full_info_uncommon.txt based on whether domain is IP
if [[ $domain =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
cat .tmp/web_full_info_uncommon.txt 2>>"$LOGFILE" | anew_q_safe webs/web_full_info_uncommon.txt
else
grep "$domain" .tmp/web_full_info_uncommon.txt | anew_q_safe webs/web_full_info_uncommon.txt
fi
if [[ "$uncommon_is_json" != true ]]; then
log_note "webprobe_full: probe output not JSON; treating as URL list" "${FUNCNAME[0]}" "${LINENO}"
awk -v dom="$domain" 'index($0, dom) && $0 ~ /^https?:\/\// {print}' .tmp/web_full_info_uncommon.txt 2>/dev/null \
| _normalize_probe_urls \
| anew_q_safe webs/web_full_info_uncommon.txt
fi

if [[ "$uncommon_is_json" == true ]]; then
# Extract plain web info
jq -r 'try . | "\(.url) [\(.status_code)] [\(.title)] [\(.webserver)] \(.tech)"' .tmp/web_full_info_uncommon.txt \
| grep "$domain" \
| anew_q_safe webs/web_full_info_uncommon_plain.txt

# Update webs_full_info_uncommon.txt based on whether domain is IP
if [[ $domain =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
cat .tmp/web_full_info_uncommon.txt 2>>"$LOGFILE" | anew_q_safe webs/web_full_info_uncommon.txt
else
grep "$domain" .tmp/web_full_info_uncommon.txt | anew_q_safe webs/web_full_info_uncommon.txt
fi
fi
uncommon_urls_extracted=$(awk 'NF {c++} END {print c+0}' .tmp/probed_uncommon_ports_tmp.txt 2>/dev/null)

# Fallback: try prior uncommon cache when current extraction yields nothing.
if [[ "${uncommon_urls_extracted:-0}" -eq 0 ]] && [[ -s "webs/web_full_info_uncommon.txt" ]]; then
_extract_probe_urls "webs/web_full_info_uncommon.txt" "$domain" ".tmp/probed_uncommon_ports_tmp.txt" || true
uncommon_urls_extracted=$(awk 'NF {c++} END {print c+0}' .tmp/probed_uncommon_ports_tmp.txt 2>/dev/null)
log_note "webprobe_full: fallback to webs/web_full_info_uncommon.txt urls_extracted=${uncommon_urls_extracted}" "${FUNCNAME[0]}" "${LINENO}"
fi
log_note "webprobe_full: probe_input_lines=${uncommon_input_lines} urls_extracted=${uncommon_urls_extracted:-0} probe_is_json=${uncommon_is_json}" "${FUNCNAME[0]}" "${LINENO}"

# Count new websites
if ! NUMOFLINES=$(anew_safe webs/webs_uncommon_ports.txt <.tmp/probed_uncommon_ports_tmp.txt | sed '/^$/d' | wc -l); then
Expand Down Expand Up @@ -2007,7 +2067,7 @@ function wordlist_gen() {

start_func "${FUNCNAME[0]}" "Wordlist Generation"

[[ -s ".tmp/url_extract_tmp.txt" ]] && cat webs/url_extract.txt | anew -q .tmp/url_extract_tmp.txt || true
[[ -s ".tmp/url_extract_tmp.txt" ]] && [[ -s "webs/url_extract.txt" ]] && cat webs/url_extract.txt | anew -q .tmp/url_extract_tmp.txt || true
# Ensure url_extract_tmp.txt exists and is not empty
if [[ -s ".tmp/url_extract_tmp.txt" ]]; then
# Define patterns for keys and values
Expand Down
7 changes: 7 additions & 0 deletions reconftw.sh
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ while true; do
fi
while IFS= read -r t; do
[[ -z "$t" ]] && continue
t=$(_sanitize_list_entry "$t") || continue
ipcidr_target "$t" "$list"
done <"$list"
shift 2
Expand Down Expand Up @@ -621,6 +622,7 @@ case $opt_mode in
sed_i 's/\r$//' "$flist"
while IFS= read -r domain <&3; do
[[ -z "$domain" ]] && continue
domain=$(_sanitize_list_entry "$domain") || continue
start
recon
end
Expand All @@ -642,6 +644,7 @@ case $opt_mode in
sed_i 's/\r$//' "$flist"
while IFS= read -r domain <&3; do
[[ -z "$domain" ]] && continue
domain=$(_sanitize_list_entry "$domain") || continue
subs_menu
done 3<"$flist"
else
Expand All @@ -656,6 +659,7 @@ case $opt_mode in
sed_i 's/\r$//' "$flist"
while IFS= read -r domain <&3; do
[[ -z "$domain" ]] && continue
domain=$(_sanitize_list_entry "$domain") || continue
passive
done 3<"$flist"
else
Expand All @@ -671,6 +675,7 @@ case $opt_mode in
sed_i 's/\r$//' "$flist"
while IFS= read -r domain <&3; do
[[ -z "$domain" ]] && continue
domain=$(_sanitize_list_entry "$domain") || continue
all
done 3<"$flist"
else
Expand Down Expand Up @@ -702,6 +707,7 @@ case $opt_mode in
sed_i 's/\r$//' "$flist"
while IFS= read -r domain <&3; do
[[ -z "$domain" ]] && continue
domain=$(_sanitize_list_entry "$domain") || continue
start
osint
end
Expand All @@ -720,6 +726,7 @@ case $opt_mode in
sed_i 's/\r$//' "$flist"
while IFS= read -r domain <&3; do
[[ -z "$domain" ]] && continue
domain=$(_sanitize_list_entry "$domain") || continue
zen_menu
done 3<"$flist"
else
Expand Down
94 changes: 94 additions & 0 deletions tests/unit/test_webprobe_full_formats.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env bats

# Per-test fixture: builds an isolated project sandbox, points reconftw's
# globals at it, puts a mock-bin directory first on PATH, and sources
# reconftw.sh in --source-only mode so its functions can be called directly.
setup() {
    local project_root
    project_root="$(cd "$(dirname "$BATS_TEST_FILENAME")/../.." && pwd)"
    export SCRIPTPATH="$project_root"
    export LOGFILE="/dev/null"
    # Neutralize color escape variables so output assertions stay plain text.
    export bred='' bblue='' bgreen='' byellow='' yellow='' reset=''

    # Scratch area plus the per-target working-dir layout reconftw expects.
    export TEST_DIR="$BATS_TEST_TMPDIR/reconftw_webprobe_full"
    mkdir -p "$TEST_DIR"
    export dir="$TEST_DIR/example.com"
    export called_fn_dir="$dir/.called_fn"
    mkdir -p "$called_fn_dir" "$dir"
    cd "$dir"

    # Mock binaries (e.g. httpx) placed here shadow the real tools.
    export MOCK_BIN="$TEST_DIR/mockbin"
    mkdir -p "$MOCK_BIN"
    export PATH="$MOCK_BIN:$PATH"

    source "$project_root/reconftw.sh" --source-only
    # Minimal flag/config surface needed by webprobe_full.
    export domain="example.com"
    export DIFF=false
    export AXIOM=false
    export WEBPROBEFULL=true
    export PROXY=false
    export UNCOMMON_PORTS_WEB="8080,8443"
    export HTTPX_UNCOMMONPORTS_THREADS=10
    export HTTPX_UNCOMMONPORTS_TIMEOUT=10
}

# Remove the per-test scratch directory.
# Guarded so teardown always returns 0: the original
# `[[ -d "$TEST_DIR" ]] && rm -rf "$TEST_DIR"` form returns 1 when the
# directory is already gone, and bats treats a non-zero teardown as a test
# failure. The ${TEST_DIR:-} check also prevents an unset/empty variable
# from ever feeding `rm -rf` a dangerous path.
teardown() {
    if [[ -n "${TEST_DIR:-}" && -d "$TEST_DIR" ]]; then
        rm -rf -- "$TEST_DIR"
    fi
}

# End-to-end: webprobe_full must accept plain URL-list (non-JSON) httpx
# output, strip wildcard "*." prefixes, and populate both the uncommon-port
# web list and the aggregated webs_all.txt.
@test "webprobe_full accepts URL-list output and updates uncommon/webs_all targets" {
    mkdir -p .tmp webs subdomains
    printf "a.example.com\n" > subdomains/subdomains.txt

    # Mock httpx: ignores every flag except -o and writes a fixed plain URL
    # list (one wildcard-prefixed host) to the requested output file.
    cat > "$MOCK_BIN/httpx" <<'SH'
#!/usr/bin/env bash
out=""
while [[ $# -gt 0 ]]; do
    case "$1" in
        -o)
            out="$2"
            shift 2
            ;;
        *)
            shift
            ;;
    esac
done
printf '%s\n' "https://*.edge.example.com:8443" "https://api.example.com:8080" > "$out"
SH
    chmod +x "$MOCK_BIN/httpx"

    run webprobe_full
    [ "$status" -eq 0 ]
    # Wildcard prefix must be normalized away; both probed hosts kept.
    [ -s "webs/webs_uncommon_ports.txt" ]
    grep -q "https://edge.example.com:8443" "webs/webs_uncommon_ports.txt"
    grep -q "https://api.example.com:8080" "webs/webs_uncommon_ports.txt"
    # Aggregated target list must also receive the normalized URL.
    [ -s "webs/webs_all.txt" ]
    grep -q "https://edge.example.com:8443" "webs/webs_all.txt"
}

# Fallback path: when the current probe yields no in-scope URLs, webprobe_full
# must recover targets from the cached webs/web_full_info_uncommon.txt file.
@test "webprobe_full falls back to cached uncommon output when current extraction is empty" {
    mkdir -p .tmp webs subdomains
    printf "a.example.com\n" > subdomains/subdomains.txt
    # Pre-seed the cache with an in-scope URL from a previous run.
    printf '%s\n' "https://cached.example.com:8443" > webs/web_full_info_uncommon.txt

    # Mock httpx: only emits an out-of-scope host, so the current-run
    # extraction produces nothing and the fallback must kick in.
    cat > "$MOCK_BIN/httpx" <<'SH'
#!/usr/bin/env bash
out=""
while [[ $# -gt 0 ]]; do
    case "$1" in
        -o)
            out="$2"
            shift 2
            ;;
        *)
            shift
            ;;
    esac
done
printf '%s\n' "https://not-in-scope.invalid:8443" > "$out"
SH
    chmod +x "$MOCK_BIN/httpx"

    run webprobe_full
    [ "$status" -eq 0 ]
    # The cached in-scope URL must survive into the uncommon-ports list.
    [ -s "webs/webs_uncommon_ports.txt" ]
    grep -q "https://cached.example.com:8443" "webs/webs_uncommon_ports.txt"
}
Loading