Too Remote to Be Local
Latency Inflation in Anycast due to Remote Peering
This repository contains the code and data for the CNSM'25 paper "Too Remote to Be Local: Latency Inflation in Anycast due to Remote Peering".
Code Contributors¶
imports and global variables (see requirements.txt for dependencies)¶
In [38]:
# !pip install -r requirements.txt
# Standard library
import os
import sys
import io
import time
import json
import gzip
import subprocess
import urllib.request
import urllib.error
import urllib.parse
from math import radians
from collections import Counter
import ipaddress
# Third-party libraries
import pandas as pd
import numpy as np
import duckdb
import requests
from bs4 import BeautifulSoup
import radix
from sklearn.neighbors import BallTree
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import geopandas as gpd
from shapely.geometry import Point
# matplotlib params
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
plt.rcParams.update({'font.size': 20})
# constants
SPEED_IN_FIBRE = 299_792 / 1.467 # 1.467 refraction index fibre optics https://www.thorlabs.com/newgrouppage9.cfm?objectgroup_id=949
EARTH_RADIUS_KM = 6371 # Radius of Earth
def haversine(lat1, lon1, lat2, lon2):
"""
Calculate the great circle distance between two points.
:param lat1: Latitude of first point.
:param lon1: Longitude of first point.
:param lat2: Latitude of second point.
:param lon2: Longitude of second point.
:return:
"""
# Convert decimal degrees to radians
lat1 = pd.to_numeric(lat1, errors='coerce')
lon1 = pd.to_numeric(lon1, errors='coerce')
lat2 = pd.to_numeric(lat2, errors='coerce')
lon2 = pd.to_numeric(lon2, errors='coerce')
lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
# Haversine formula
dlon = lon2 - lon1
dlat = lat2 - lat1
a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
# Use arctan2 for numerical stability
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
distance_km = EARTH_RADIUS_KM * c
return distance_km
Read in data and combine with external datasets¶
In [3]:
traceroutes_df = pd.read_parquet('./data/traces.parquet', engine='pyarrow') # engine='fastparquet'
# filter out 1.0.0.0 as the parquet has 100+ traceroutes from each VP for this destination due to a measurement error
traceroutes_df = traceroutes_df[traceroutes_df['dst'] != '1.0.0.0']
traceroutes_df.head()
Out[3]:
| tx_hostname | dst | hop_count | probe_count | hop_addr | hop_probe_ttl | hop_rtt | hop_reply_ttl | |
|---|---|---|---|---|---|---|---|---|
| 5 | aep3-ar.ark.caida.org | 100.0.0.1 | 12 | 17 | 192.168.1.1 | 1 | 1.227 | 64 |
| 6 | aep3-ar.ark.caida.org | 100.0.0.1 | 12 | 17 | 192.168.0.1 | 2 | 1.485 | 63 |
| 7 | aep3-ar.ark.caida.org | 100.0.0.1 | 12 | 17 | 181.96.62.70 | 7 | 12.157 | 249 |
| 8 | aep3-ar.ark.caida.org | 100.0.0.1 | 12 | 17 | 195.22.220.58 | 8 | 10.412 | 248 |
| 9 | aep3-ar.ark.caida.org | 100.0.0.1 | 12 | 17 | 89.221.41.221 | 9 | 147.013 | 245 |
In [4]:
# drop hop_count, probe_count, hop_reply_ttl (we do not use these for our analysis)
traceroutes_df = traceroutes_df.drop(columns=['hop_count', 'probe_count', 'hop_reply_ttl'])
In [5]:
# # extract hop_name column for retrieving OI data
#
# unique_hop_addr = traceroutes_df['hop_addr'].dropna().unique()
# pd.DataFrame(unique_hop_addr).to_csv('./data/unique_hops.csv', index=False, header=False)
In [7]:
# add OpenINTEL PTR recrods to traceroute data
hop_names = pd.read_csv('./data/ptr_lookup.csv.gz')
traceroutes_df = pd.merge(traceroutes_df, hop_names, how='left', left_on='hop_addr', right_on='ip4_address')
# rename ptr_name to hop_name
traceroutes_df = traceroutes_df.rename(columns={'ptr_name': 'hop_name'})
In [8]:
# read in locations of Ark VPs
ark_locs = pd.read_csv('./data/arklocs.txt', header=None, names=['tx_hostname', 'vp_lat', 'vp_lon', 'tx_city', 'vp_asn'])
ark_locs['tx_hostname'] = ark_locs['tx_hostname'].str.split(".ark.caida.org").str[0]
ark_locs['vp_asn'] = ark_locs['vp_asn'].apply(lambda x: '-' if pd.isna(x) else str(int(float(x))))
ark_locs['tx_airport'] = ark_locs['tx_hostname'].str[:3]
ark_locs.head()
Out[8]:
| tx_hostname | vp_lat | vp_lon | tx_city | vp_asn | tx_airport | |
|---|---|---|---|---|---|---|
| 0 | hlz2-nz | -37.79 | 175.28 | Hamilton | 9500 | hlz |
| 1 | hkg4-cn | 22.36 | 114.12 | Hong Kong | 212238 | hkg |
| 2 | lax4-us | 33.92 | -118.39 | Los Angeles | 212238 | lax |
| 3 | bfi-us | 47.61 | -122.33 | Seattle | 209 | bfi |
| 4 | drs-de | 51.03 | 13.73 | Dresden | 680 | drs |
In [9]:
# add ark locations
traceroutes_df['tx_hostname'] = traceroutes_df['tx_hostname'].str.split(".ark.caida.org").str[0]
traceroutes_df = pd.merge(traceroutes_df, ark_locs, on='tx_hostname', how='left')
del ark_locs
traceroutes_df.head()
Out[9]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | vp_asn | tx_airport | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar | 100.0.0.1 | 192.168.1.1 | 1 | 1.227 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | 7303 | aep |
| 1 | aep3-ar | 100.0.0.1 | 192.168.0.1 | 2 | 1.485 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | 7303 | aep |
| 2 | aep3-ar | 100.0.0.1 | 181.96.62.70 | 7 | 12.157 | 181.96.62.70 | host70.181-96-62.telecom.net.ar. | -34.6 | -58.38 | Buenos Aires | 7303 | aep |
| 3 | aep3-ar | 100.0.0.1 | 195.22.220.58 | 8 | 10.412 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | 7303 | aep |
| 4 | aep3-ar | 100.0.0.1 | 89.221.41.221 | 9 | 147.013 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | 7303 | aep |
exploratory data analysis¶
In [10]:
# how many traceroutes?
len(traceroutes_df)
Out[10]:
26717905
In [11]:
# how many VPs participated?
traceroutes_df['tx_hostname'].nunique()
Out[11]:
274
In [12]:
# how many ASes are these in?
traceroutes_df['vp_asn'].nunique()
Out[12]:
178
In [13]:
# how many targets did we measure?
traceroutes_df['dst'].nunique()
Out[13]:
13734
In [14]:
# how many tx_hostname, dst pairs (i.e., how many traceroutes?)
traceroutes_df[['tx_hostname', 'dst']].drop_duplicates().shape[0]
Out[14]:
3353799
In [15]:
# what is the average number of VPs that completed a traceroute towards a target
traceroutes_df.groupby('dst')['tx_hostname'].nunique().mean()
Out[15]:
244.19681083442552
In [16]:
# what is the average number of hops captured?
traceroutes_df.groupby(['tx_hostname', 'dst']).size().mean()
Out[16]:
7.966459826602608
In [17]:
# how many unique hop addresses?
traceroutes_df['hop_addr'].nunique()
Out[17]:
99391
In [18]:
# how many unique hop addresses have PTR records?
traceroutes_df[~traceroutes_df['hop_name'].isna()]['hop_addr'].nunique()
Out[18]:
45106
In [19]:
# how many hops have PTR records?
(~traceroutes_df['hop_name'].isna()).sum()
Out[19]:
14131285
In [20]:
# how many hops do not have PTR records?
traceroutes_df['hop_name'].isna().sum()
Out[20]:
12586620
In [21]:
# how many unique PTR records did we observe?
traceroutes_df['hop_name'].nunique()
Out[21]:
41758
Calculate distances and RTTS¶
In [22]:
# get RTT to the PoP for each traceroute
dst_hops_df = traceroutes_df[traceroutes_df['dst'] == traceroutes_df['hop_addr']]
dst_hops_df = dst_hops_df[['tx_hostname', 'dst', 'hop_rtt']].rename(
columns={'hop_rtt': 'pop_rtt'}
)
dst_hops_df = dst_hops_df.sort_values(by='pop_rtt').drop_duplicates(subset=['tx_hostname', 'dst']) # remove duplicate RTTs observed between an (Ark, PoP) pair -> keep the lowest RTT
traceroutes_df = pd.merge(traceroutes_df, dst_hops_df, on=['tx_hostname', 'dst'], how='left')
In [23]:
def rtt_to_distance(rtt_ms):
one_way_time_s = (rtt_ms / 10_00) / 2 # to seconds and divide by 2 to get one-way latency
distance_km = one_way_time_s * SPEED_IN_FIBRE
return distance_km
In [24]:
# calculate distance between VP and hop
traceroutes_df['ark_to_hop'] = traceroutes_df['hop_rtt'].apply(rtt_to_distance)
# calculate distance between VP and PoP
traceroutes_df['ark_to_pop'] = traceroutes_df['pop_rtt'].apply(rtt_to_distance)
# calculate distance between hop and PoP
#traceroutes_df['hop_to_pop'] = np.maximum(0, result_df['ark_to_pop'] - result_df['ark_to_hop']) # this value is meaningless as the delta RTT between the two is unreliable
identify bogon hops (['is_bogon'])¶
In [25]:
def ipv4_to_uint32(ip_str):
"""Converts an IPv4 string to an unsigned 32-bit integer."""
if ip_str is None:
return None
try:
# Convert to IPv4Address object first to validate
ip_obj = ipaddress.ip_address(ip_str)
if ip_obj.version == 4:
return int(ip_obj) # Returns the integer representation
else:
return None # Not an IPv4 address
except ValueError:
# Handle cases where the string is not a valid IP address at all
return None
In [27]:
# Get bogon list from Cymru https://www.team-cymru.com/bogon-networks
bogon_df = pd.read_csv('./data/2025-04-19.fullbogons-ipv4.csv')
bogon_ranges_cidr = bogon_df['Bogons'].tolist()
# create list of ranges
bogon_int_ranges = []
for cidr in bogon_ranges_cidr:
network = ipaddress.ip_network(cidr, strict=False)
start_int = int(network.network_address)
end_int = int(network.broadcast_address)
bogon_int_ranges.append((start_int, end_int))
del bogon_df
In [28]:
# create u32 of hop
ip_int_series = traceroutes_df['hop_addr'].apply(ipv4_to_uint32)
bogon_mask = pd.Series(False, index=ip_int_series.index)
for start_int, end_int in bogon_int_ranges:
# Series.between is inclusive by default (both start and end)
in_range_mask = ip_int_series.between(start_int, end_int, inclusive='both')
bogon_mask |= in_range_mask # Apply OR operation
traceroutes_df['hop_is_bogon'] = bogon_mask
del ip_int_series
traceroutes_df.head()
Out[28]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | vp_asn | tx_airport | pop_rtt | ark_to_hop | ark_to_pop | hop_is_bogon | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar | 100.0.0.1 | 192.168.1.1 | 1 | 1.227 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | 7303 | aep | 175.665 | 125.373137 | 17949.203027 | True |
| 1 | aep3-ar | 100.0.0.1 | 192.168.0.1 | 2 | 1.485 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | 7303 | aep | 175.665 | 151.735215 | 17949.203027 | True |
| 2 | aep3-ar | 100.0.0.1 | 181.96.62.70 | 7 | 12.157 | 181.96.62.70 | host70.181-96-62.telecom.net.ar. | -34.6 | -58.38 | Buenos Aires | 7303 | aep | 175.665 | 1242.185189 | 17949.203027 | False |
| 3 | aep3-ar | 100.0.0.1 | 195.22.220.58 | 8 | 10.412 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | 7303 | aep | 175.665 | 1063.883539 | 17949.203027 | False |
| 4 | aep3-ar | 100.0.0.1 | 89.221.41.221 | 9 | 147.013 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | 7303 | aep | 175.665 | 15021.581900 | 17949.203027 | False |
load in ipinfo data¶
In [29]:
# load in ipinfo data (weekly snapshots) (accessible through IPInfo)
# load data in duck DB
db_file = './data/ip_locations_ipv4.duckdb'
table_name = 'locations_ipv4'
json_file = './data/2025-03-31_standard_location.json.gz' # monthly snapshot
# if db_file exists, we are done
if os.path.exists(db_file):
print(f"Database file {db_file} already exists. Skipping loading.")
else:
print(f"Loading IPv4 ipinfo data from {json_file} into DuckDB table {table_name}")
start_time = time.time()
# create the database file
con = duckdb.connect(database=db_file, read_only=False)
# add ipv4 translation function to duckdb
con.create_function('ipv4_to_uint32_udf', ipv4_to_uint32, [duckdb.typing.VARCHAR], duckdb.typing.UINTEGER)
# create table schema
con.execute(f"""
DROP TABLE IF EXISTS {table_name};
CREATE TABLE {table_name} (
start_ip_str VARCHAR,
end_ip_str VARCHAR,
join_key VARCHAR,
city VARCHAR,
region VARCHAR,
country VARCHAR,
latitude DOUBLE,
longitude DOUBLE,
postal_code VARCHAR,
timezone VARCHAR,
start_ip_uint32 UINTEGER,
end_ip_uint32 UINTEGER
);
""")
# load in data (excluding ipv6 addresses)
try:
con.execute(f"""
INSERT INTO {table_name} (
start_ip_str, end_ip_str, join_key, city, region, country,
latitude, longitude, postal_code, timezone,
start_ip_uint32, end_ip_uint32
)
SELECT
start_ip,
end_ip,
join_key,
city,
region,
country,
CAST(latitude AS DOUBLE),
CAST(longitude AS DOUBLE),
postal_code,
timezone,
ipv4_to_uint32_udf(start_ip) AS start_ip_u32,
ipv4_to_uint32_udf(end_ip) AS end_ip_u32
FROM read_json_auto('{json_file}', filename=true)
WHERE
-- filter out non-ipv4 addresses
start_ip LIKE '%.%.%.%' AND start_ip NOT LIKE '%:%' AND
end_ip LIKE '%.%.%.%' AND end_ip NOT LIKE '%:%'
""")
except Exception as e:
print(f"An error occurred during INSERT: {e}")
con.close()
raise
print("Creating index on IP ranges...")
con.execute(f"CREATE INDEX ip_range_idx_u32 ON {table_name} (start_ip_uint32, end_ip_uint32);")
con.close()
end_time = time.time()
print(f"IPInfo (IPV4) Database creation took: {end_time - start_time:.2f} seconds")
Loading IPv4 ipinfo data from ./data/2025-03-31_standard_location.json.gz into DuckDB table locations_ipv4
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
Creating index on IP ranges...
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
IPInfo (IPV4) Database creation took: 294.64 seconds
In [30]:
# get unique non-bogon addresses
unique_addr = traceroutes_df[~traceroutes_df['hop_is_bogon']]['hop_addr'].dropna().unique()
print(f"Found {len(unique_addr)} unique hop addresses out of {len(traceroutes_df)} total hops.")
# get u32 values for lookup
addr_data = []
for hop in unique_addr:
ip_uint32 = ipv4_to_uint32(hop)
if ip_uint32 is not None: # Keep only valid IPv4 conversions
addr_data.append({'hop_addr': hop, 'hop_addr_uint32': ip_uint32})
unique_addr_df = pd.DataFrame(addr_data)
unique_addr_df['hop_addr_uint32'] = unique_addr_df['hop_addr_uint32'].astype('uint32')
del unique_addr
del addr_data
Found 93110 unique hop addresses out of 26717905 total hops.
In [31]:
columns_to_load = [
'city',
'latitude',
'longitude',
'start_ip_uint32',
'end_ip_uint32'
]
select_cols_str = ", ".join(columns_to_load)
dtypes_to_load = {
'city': 'string',
'latitude': 'float32',
'longitude': 'float32',
'start_ip_uint32': 'uint32',
'end_ip_uint32': 'uint32'
}
print(f"Loading selected columns from {table_name} into Pandas...")
start_load_time = time.time()
con = None
try:
con = duckdb.connect(database=db_file, read_only=True)
print(" Connected to DuckDB.")
# cast df
select_expressions = []
for col in columns_to_load:
dtype = dtypes_to_load.get(col)
if dtype:
# data types
sql_type = dtype.replace('string', 'VARCHAR') \
.replace('float32', 'FLOAT') \
.replace('float64', 'DOUBLE') \
.replace('uint32', 'UINTEGER') \
.replace('Int64', 'BIGINT')
select_expressions.append(f"CAST({col} AS {sql_type}) AS {col}")
else:
select_expressions.append(col)
select_query_cols_str = ",\n ".join(select_expressions)
query = f"""
SELECT
{select_query_cols_str}
FROM {table_name}
WHERE start_ip_uint32 IS NOT NULL -- exclude NaN values
ORDER BY start_ip_uint32 ASC; -- sort by start_ip_uint32
"""
ipinfo_df = con.execute(query).fetchdf()
print(f"Successfully loaded {len(ipinfo_df)} rows.")
except Exception as e:
print(f"An error occurred loading data from DuckDB: {e}")
finally:
if con:
con.close()
print(" DuckDB connection closed.")
end_load_time = time.time()
print(f"\nLoading and initial sorting took: {end_load_time - start_load_time:.2f} seconds")
print("\nLoaded ip2location DataFrame Info:")
ipinfo_df.info(memory_usage='deep') # Check memory usage
Loading selected columns from locations_ipv4 into Pandas... Connected to DuckDB.
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
Successfully loaded 27274159 rows. DuckDB connection closed. Loading and initial sorting took: 9.88 seconds Loaded ip2location DataFrame Info: <class 'pandas.core.frame.DataFrame'> RangeIndex: 27274159 entries, 0 to 27274158 Data columns (total 5 columns): # Column Dtype --- ------ ----- 0 city object 1 latitude float32 2 longitude float32 3 start_ip_uint32 uint32 4 end_ip_uint32 uint32 dtypes: float32(2), object(1), uint32(2) memory usage: 2.1 GB
In [32]:
# unique hop addresses df
unique_addr_df = unique_addr_df.sort_values('hop_addr_uint32').reset_index(drop=True)
# add location information
unique_addr_df = pd.merge_asof(
unique_addr_df,
ipinfo_df,
left_on='hop_addr_uint32',
right_on='start_ip_uint32',
direction='backward'
)
In [33]:
cols_to_merge = ['hop_addr', 'city', 'latitude', 'longitude']
# add location data to full dataset
traceroutes_df = pd.merge(
traceroutes_df,
unique_addr_df[cols_to_merge],
on='hop_addr',
how='left'
)
rename_map = {
"city": "ipinfo_city",
"latitude": "ipinfo_lat",
"longitude": "ipinfo_lon"
}
traceroutes_df = traceroutes_df.rename(columns=rename_map)
In [34]:
# how many hops have IPInfo locations?
(~traceroutes_df['ipinfo_city'].isna()).sum()
Out[34]:
23973174
In [35]:
# how many hops have no IPInfo locations (i.e., bogons)?
traceroutes_df['ipinfo_city'].isna().sum()
Out[35]:
2744731
PTR record translation (Hoiho)¶
In [38]:
# requires JSON:XS
# configuration files
ptr_records_file = './data/ptr_records.csv.gz'
hoiho_script = './data/hoiho/hoiho-apply.pl'
hoiho_db = './data/hoiho/2024-08.midar-iff.geo-re.jsonl.gz'
# store unique PTR records
print(f"\nSaving unique PTR records to {ptr_records_file}...")
traceroutes_df['hop_name'] = traceroutes_df['hop_name'].str.rstrip('.') # OI data has a trailing dot
unique_hop_names = traceroutes_df['hop_name'].unique()
pd.DataFrame(unique_hop_names).to_csv(ptr_records_file, index=False, header=False)
print(f"Saved {len(unique_hop_names)} unique names.")
process = False # flip to run perl script manually
if process:
# Check if required files/scripts exist before running
if not os.path.exists(hoiho_script):
print(f"ERROR: Hoiho script not found at '{hoiho_script}'")
elif not os.path.exists(hoiho_db):
print(f"ERROR: Hoiho database not found at '{hoiho_db}'")
elif not os.path.exists(ptr_records_file):
print(f"ERROR: Input PTR file not found at '{ptr_records_file}'")
else:
print(f"\nExecuting Hoiho Perl script: {hoiho_script}...")
command = f"cat {ptr_records_file} | perl {hoiho_script} {hoiho_db}"
try:
# run perl command
process_result = subprocess.run(
command,
shell=True,
capture_output=True,
text=True,
check=True
)
# Get the standard output from the completed process
ptr_translations_content = process_result.stdout
print(f"Perl script executed successfully. Output length: {len(ptr_translations_content)} chars.")
# Check for errors printed to stderr
if process_result.stderr:
print("\n--- Warnings/Errors from Perl script (stderr) ---")
print(process_result.stderr)
print("-------------------------------------------------")
print("\nLoading PTR translations from script output...")
if ptr_translations_content:
ptr_map = pd.read_csv(
io.StringIO(ptr_translations_content),
sep=' ',
on_bad_lines='skip',
names=['hop_name', 'location', 'code', 'ptr_lat', 'ptr_lon', 'ptr_city'],
skipinitialspace=True
)
print(f"Loaded {len(ptr_map)} translation entries.")
else:
print("Warning: Perl script produced no output. PTR map will be empty.")
ptr_map = pd.DataFrame(columns=['hop_name', 'location', 'code', 'ptr_lat', 'ptr_lon', 'ptr_city'])
ptr_map.to_csv('./data/hoiho_translations.csv.gz', index=False)
except FileNotFoundError:
print(f"ERROR: 'perl' command not found. Is Perl installed and in your system's PATH?")
except subprocess.CalledProcessError as e:
print(f"ERROR: Perl script execution failed with exit code {e.returncode}.")
print("--- Perl script STDOUT ---")
print(e.stdout)
print("--- Perl script STDERR ---")
print(e.stderr)
except Exception as e:
print(f"An unexpected error occurred during subprocess execution: {e}")
else:
# Read records (instead of running perl script)
ptr_map = pd.read_csv('./data/hoiho_translations.csv.gz')
print("\nMerging PTR translations into final DataFrame...")
cols_to_merge = ['hop_name', 'ptr_lat', 'ptr_lon', 'ptr_city']
traceroutes_df = pd.merge(
traceroutes_df,
ptr_map[cols_to_merge],
on='hop_name',
how='left'
)
print("Merge complete.")
Saving unique PTR records to ./data/ptr_records.csv.gz... Saved 41759 unique names. Merging PTR translations into final DataFrame... Merge complete.
In [39]:
# how many PTR records have a city location?
(~ptr_map['ptr_city'].isna()).sum()
Out[39]:
16565
In [40]:
# how many PTR records have no city location?
ptr_map['ptr_city'].isna().sum()
Out[40]:
3048
In [41]:
# how many hops have PTR city locations?
(~traceroutes_df['ptr_city'].isna()).sum()
Out[41]:
3301373
In [42]:
# how many hops have NO PTR city locations?
(traceroutes_df['ptr_city'].isna()).sum()
Out[42]:
23416532
Validate hop locations¶
In [43]:
# loc_valid
# calculate distance between 'vp_lat', 'vp_lon' and 'latitude', 'longitude'
# verify distance is smaller than 'ark_to_hop' (which is in km)
### verify IPinfo locations
# calculate distance
traceroutes_df['calculated_distance_km'] = haversine(
traceroutes_df['vp_lat'],
traceroutes_df['vp_lon'],
traceroutes_df['ipinfo_lat'],
traceroutes_df['ipinfo_lon']
)
# distance between points must be less than the maximum distance travelled given the latency difference
traceroutes_df['ipinfo_loc_valid'] = traceroutes_df['calculated_distance_km'] < traceroutes_df['ark_to_hop']
### verify PTR record locations
traceroutes_df['calculated_distance_km'] = haversine(
traceroutes_df['vp_lat'],
traceroutes_df['vp_lon'],
traceroutes_df['ptr_lat'],
traceroutes_df['ptr_lon']
)
traceroutes_df['ptr_loc_valid'] = traceroutes_df['calculated_distance_km'] < traceroutes_df['ark_to_hop']
traceroutes_df.drop(columns=['calculated_distance_km'], inplace=True)
# ptr valids
valid_count = traceroutes_df['ptr_loc_valid'].sum()
total_count = (~traceroutes_df['ptr_city'].isna()).sum()
print(f"PTR locations {valid_count} valid out of {total_count}")
# ipinfo valids
valid_count = traceroutes_df['ipinfo_loc_valid'].sum()
total_count = (~traceroutes_df['ipinfo_city'].isna()).sum()
print(f"IPInfo locations {valid_count} valid out of {total_count}")
either_valid = (traceroutes_df['ptr_loc_valid'] | traceroutes_df['ipinfo_loc_valid']).sum()
print(f"Locations with either PTR or IPInfo valid locations: {either_valid}")
neither_valid = (~traceroutes_df['ptr_loc_valid'] & ~traceroutes_df['ipinfo_loc_valid']).sum()
print(f"Locations with neither PTR nor IPInfo valid locations: {neither_valid}")
PTR locations 3243374 valid out of 3301373 IPInfo locations 20374390 valid out of 23973174 Locations with either PTR or IPInfo valid locations: 20454038 Locations with neither PTR nor IPInfo valid locations: 6263867
In [44]:
# performing global invalidation
invalid_ipinfo_ips = set(
traceroutes_df.loc[traceroutes_df['ipinfo_loc_valid'] == False, 'hop_addr'].unique()
)
invalid_ptr_ips = set(
traceroutes_df.loc[traceroutes_df['ptr_loc_valid'] == False, 'hop_addr'].unique()
)
original_ipinfo_valid_count = traceroutes_df['ipinfo_loc_valid'].sum()
traceroutes_df.loc[traceroutes_df['hop_addr'].isin(invalid_ipinfo_ips), 'ipinfo_loc_valid'] = False
new_ipinfo_valid_count = traceroutes_df['ipinfo_loc_valid'].sum()
original_ptr_valid_count = traceroutes_df['ptr_loc_valid'].sum()
traceroutes_df.loc[traceroutes_df['hop_addr'].isin(invalid_ptr_ips), 'ptr_loc_valid'] = False
new_ptr_valid_count = traceroutes_df['ptr_loc_valid'].sum()
print(f"Global invalidation changed {original_ipinfo_valid_count - new_ipinfo_valid_count} IPInfo locations from valid to invalid.")
print(f"Global invalidation changed {original_ptr_valid_count - new_ptr_valid_count} PTR locations from valid to invalid.")
Global invalidation changed 1496390 IPInfo locations from valid to invalid. Global invalidation changed 35576 PTR locations from valid to invalid.
In [45]:
# ptr valids
valid_count = traceroutes_df['ptr_loc_valid'].sum()
total_count = (~traceroutes_df['ptr_city'].isna()).sum()
print(f"PTR locations {valid_count} valid out of {total_count}")
# ipinfo valids
valid_count = traceroutes_df['ipinfo_loc_valid'].sum()
total_count = (~traceroutes_df['ipinfo_city'].isna()).sum()
print(f"IPInfo locations {valid_count} valid out of {total_count}")
either_valid = (traceroutes_df['ptr_loc_valid'] | traceroutes_df['ipinfo_loc_valid']).sum()
print(f"Locations with either PTR or IPInfo valid locations: {either_valid}")
neither_valid = (~traceroutes_df['ptr_loc_valid'] & ~traceroutes_df['ipinfo_loc_valid']).sum()
print(f"Locations with neither PTR nor IPInfo valid locations: {neither_valid}")
PTR locations 3207798 valid out of 3301373 IPInfo locations 18878000 valid out of 23973174 Locations with either PTR or IPInfo valid locations: 18962816 Locations with neither PTR nor IPInfo valid locations: 7755089
In [46]:
valid_count = traceroutes_df[traceroutes_df['ptr_loc_valid']]['hop_addr'].nunique()
total_count = traceroutes_df[(~traceroutes_df['ptr_city'].isna())]['hop_addr'].nunique()
print(f"Unique hops with PTR record inferred locations: {total_count} of which {valid_count} are valid")
# ipinfo valids
valid_count = traceroutes_df[traceroutes_df['ipinfo_loc_valid']]['hop_addr'].nunique()
total_count = traceroutes_df[(~traceroutes_df['ipinfo_city'].isna())]['hop_addr'].nunique()
print(f"Unique hops with IPinfo locations: {total_count} of which {valid_count} are valid")
either_valid = traceroutes_df[(traceroutes_df['ptr_loc_valid'] | traceroutes_df['ipinfo_loc_valid'])]['hop_addr'].nunique()
print(f"Unique hops with either PTR or IPInfo valid locations: {either_valid}")
both_invalid = traceroutes_df[((traceroutes_df['ptr_loc_valid'] == False) & (traceroutes_df['ipinfo_loc_valid'] == False))]['hop_addr'].nunique()
print(f"Unique hops with neither PTR nor IPInfo valid locations: {both_invalid}")
Unique hops with PTR record inferred locations: 16974 of which 16734 are valid Unique hops with IPinfo locations: 93110 of which 74469 are valid Unique hops with either PTR or IPInfo valid locations: 74876 Unique hops with neither PTR nor IPInfo valid locations: 24515
In [47]:
# get preferred locations (ptr if valid, else ipinfo if valid, else NaN)
conditions = [
traceroutes_df['ptr_loc_valid'] == True,
traceroutes_df['ipinfo_loc_valid'] == True
]
# For hop_city
city_choices = [
traceroutes_df['ptr_city'],
traceroutes_df['ipinfo_city']
]
# For hop_lat
lat_choices = [
traceroutes_df['ptr_lat'],
traceroutes_df['ipinfo_lat']
]
lon_choices = [
traceroutes_df['ptr_lon'],
traceroutes_df['ipinfo_lon']
]
# get hop location info
traceroutes_df['hop_city'] = np.select(conditions, city_choices, default=pd.NA)
traceroutes_df['hop_lat'] = np.select(conditions, lat_choices, default=np.nan)
traceroutes_df['hop_lon'] = np.select(conditions, lon_choices, default=np.nan)
traceroutes_df = traceroutes_df.drop(columns=
[
'ipinfo_city', 'ipinfo_lat', 'ipinfo_lon',
'ptr_city', 'ptr_lat', 'ptr_lon',
'ptr_loc_valid', 'ipinfo_loc_valid'
]
)
print(f"no locations available {(traceroutes_df['hop_city'].isna()).sum()}")
print(f"locations available {(~traceroutes_df['hop_city'].isna()).sum()}")
no locations available 7755101 locations available 18962804
In [48]:
# target addresses are anycast, where we cannot trust geolocation data
traceroutes_df.loc[
traceroutes_df['hop_addr'] == traceroutes_df['dst'],
['hop_lat', 'hop_lon']
] = np.nan
traceroutes_df.loc[
traceroutes_df['hop_addr'] == traceroutes_df['dst'],
['hop_city']
] = 'ANYCAST'
In [49]:
# convert city formats (Zürich -> Zurich) for consistency between Hoiho and IPInfo
traceroutes_df['hop_city'] = (
traceroutes_df['hop_city']
.str.normalize('NFD')
.str.encode('ascii', 'ignore')
.str.decode('utf-8')
.str.lower()
)
add USN regions¶
In [51]:
# load in region data
un_regions_df = pd.read_csv('./data/unsd_geography.csv', sep=';')
# We only need the code and the region names
un_regions_df = un_regions_df[['ISO-alpha3 Code', 'Region Name', 'Sub-region Name']]
un_regions_df.head()
Out[51]:
| ISO-alpha3 Code | Region Name | Sub-region Name | |
|---|---|---|---|
| 0 | DZA | Africa | Northern Africa |
| 1 | EGY | Africa | Northern Africa |
| 2 | LBY | Africa | Northern Africa |
| 3 | MAR | Africa | Northern Africa |
| 4 | SDN | Africa | Northern Africa |
In [52]:
# get world map using geopandas (converting coordinates to country)
world = gpd.read_file("https://naturalearth.s3.amazonaws.com/110m_cultural/ne_110m_admin_0_countries.zip")
# We only need the country's ISO code and its geometric shape.
world = world[['ISO_A3', 'geometry']]
world.head()
Out[52]:
| ISO_A3 | geometry | |
|---|---|---|
| 0 | FJI | MULTIPOLYGON (((180 -16.06713, 180 -16.55522, ... |
| 1 | TZA | POLYGON ((33.90371 -0.95, 34.07262 -1.05982, 3... |
| 2 | ESH | POLYGON ((-8.66559 27.65643, -8.66512 27.58948... |
| 3 | CAN | MULTIPOLYGON (((-122.84 49, -122.97421 49.0025... |
| 4 | USA | MULTIPOLYGON (((-122.84 49, -120 49, -117.0312... |
In [53]:
# create lookup map (converting coordinates to UNSD region)
world_with_regions = world.merge(
un_regions_df,
left_on='ISO_A3',
right_on='ISO-alpha3 Code',
how='left'
)
world_with_regions.head()
Out[53]:
| ISO_A3 | geometry | ISO-alpha3 Code | Region Name | Sub-region Name | |
|---|---|---|---|---|---|
| 0 | FJI | MULTIPOLYGON (((180 -16.06713, 180 -16.55522, ... | FJI | Oceania | Melanesia |
| 1 | TZA | POLYGON ((33.90371 -0.95, 34.07262 -1.05982, 3... | TZA | Africa | Sub-Saharan Africa |
| 2 | ESH | POLYGON ((-8.66559 27.65643, -8.66512 27.58948... | ESH | Africa | Northern Africa |
| 3 | CAN | MULTIPOLYGON (((-122.84 49, -122.97421 49.0025... | CAN | Americas | Northern America |
| 4 | USA | MULTIPOLYGON (((-122.84 49, -120 49, -117.0312... | USA | Americas | Northern America |
In [54]:
# create geo dataframe for our coordinates
geometry = [Point(xy) for xy in zip(traceroutes_df['hop_lon'], traceroutes_df['hop_lat'])]
traceroutes_gdf = gpd.GeoDataFrame(
traceroutes_df,
crs="EPSG:4326", # WGS84 is the standard for GPS coordinates
geometry=geometry
)
In [55]:
# perform lookup (converting coordinates to country)
points_with_regions = gpd.sjoin(
traceroutes_gdf,
world_with_regions,
how="left",
predicate="within"
)
points_with_regions.head()
Out[55]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | ... | hop_is_bogon | hop_city | hop_lat | hop_lon | geometry | index_right | ISO_A3 | ISO-alpha3 Code | Region Name | Sub-region Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar | 100.0.0.1 | 192.168.1.1 | 1 | 1.227 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | True | <NA> | NaN | NaN | POINT EMPTY | NaN | NaN | NaN | NaN | NaN |
| 1 | aep3-ar | 100.0.0.1 | 192.168.0.1 | 2 | 1.485 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | True | <NA> | NaN | NaN | POINT EMPTY | NaN | NaN | NaN | NaN | NaN |
| 2 | aep3-ar | 100.0.0.1 | 181.96.62.70 | 7 | 12.157 | 181.96.62.70 | host70.181-96-62.telecom.net.ar | -34.6 | -58.38 | Buenos Aires | ... | False | buenos aires | -34.613152 | -58.377232 | POINT (-58.37723 -34.61315) | 9.0 | ARG | ARG | Americas | Latin America and the Caribbean |
| 3 | aep3-ar | 100.0.0.1 | 195.22.220.58 | 8 | 10.412 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | False | buenos aires | -34.613152 | -58.377232 | POINT (-58.37723 -34.61315) | 9.0 | ARG | ARG | Americas | Latin America and the Caribbean |
| 4 | aep3-ar | 100.0.0.1 | 89.221.41.221 | 9 | 147.013 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | False | miami | 25.774269 | -80.193657 | POINT (-80.19366 25.77427) | 4.0 | USA | USA | Americas | Northern America |
5 rows × 25 columns
In [56]:
# rename results
traceroutes_df = points_with_regions.rename(columns={
'Region Name': 'hop_region',
'Sub-region Name': 'hop_sub_region',
'ISO_A3': 'hop_country_code',
}).drop(columns=['ISO-alpha3 Code', 'index_right', 'geometry'])
del points_with_regions
traceroutes_df.head()
Out[56]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | ... | pop_rtt | ark_to_hop | ark_to_pop | hop_is_bogon | hop_city | hop_lat | hop_lon | hop_country_code | hop_region | hop_sub_region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar | 100.0.0.1 | 192.168.1.1 | 1 | 1.227 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | 175.665 | 125.373137 | 17949.203027 | True | <NA> | NaN | NaN | NaN | NaN | NaN |
| 1 | aep3-ar | 100.0.0.1 | 192.168.0.1 | 2 | 1.485 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | 175.665 | 151.735215 | 17949.203027 | True | <NA> | NaN | NaN | NaN | NaN | NaN |
| 2 | aep3-ar | 100.0.0.1 | 181.96.62.70 | 7 | 12.157 | 181.96.62.70 | host70.181-96-62.telecom.net.ar | -34.6 | -58.38 | Buenos Aires | ... | 175.665 | 1242.185189 | 17949.203027 | False | buenos aires | -34.613152 | -58.377232 | ARG | Americas | Latin America and the Caribbean |
| 3 | aep3-ar | 100.0.0.1 | 195.22.220.58 | 8 | 10.412 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | 175.665 | 1063.883539 | 17949.203027 | False | buenos aires | -34.613152 | -58.377232 | ARG | Americas | Latin America and the Caribbean |
| 4 | aep3-ar | 100.0.0.1 | 89.221.41.221 | 9 | 147.013 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | 175.665 | 15021.581900 | 17949.203027 | False | miami | 25.774269 | -80.193657 | USA | Americas | Northern America |
5 rows × 22 columns
Get ASN data¶
In [57]:
def find_day_url_v4():
#https://data.caida.org/datasets/routing/routeviews-prefix2as/2025/04/routeviews-rv2-20250418-1200.pfx2as.gz
base_url = 'http://data.caida.org/datasets/routing/routeviews-prefix2as/{}/{:02d}'.format(2025, 4)
daymask = '{}{:02d}{:02d}'.format(2025, 4, 19)
dayurl = None
try:
page = requests.get(base_url, timeout=10).text
soup = BeautifulSoup(page, 'html.parser')
for node in soup.find_all('a'):
href = node.get('href')
if href and href.endswith('pfx2as.gz'):
find_dayurl = f"{base_url}/{href}" # Construct full URL
if daymask in find_dayurl:
dayurl = find_dayurl
break
except requests.exceptions.RequestException as e:
print(f"Error fetching CAIDA directory page {base_url}: {e}")
sys.exit(99)
if dayurl is None:
print('Unable to find day-specific URL for {}'.format(base_url))
sys.exit(99)
return dayurl
In [58]:
# adds ASN and BGP prefix from CAIDA data to a csv with ipaddresses
def fetch_caida_data_as_radix():
caida_url = find_day_url_v4()
caida_radix = radix.Radix()
try:
sys.stdout.write('Retrieving {} ... '.format(caida_url))
sys.stdout.flush()
response = urllib.request.urlopen(caida_url)
gzFile = io.BytesIO()
gzFile.write(response.read())
print('OK')
except os.error as err:
print('Failed to download or parse {}'.format(caida_url))
print(err)
sys.exit(99)
gzFile.seek(0, os.SEEK_END)
print('Read {} bytes of compressed data'.format(gzFile.tell()))
gzFile.seek(0, os.SEEK_SET)
# Iterate over decompressed content
pfx2as = gzip.GzipFile(fileobj=gzFile, mode='rb')
pfx_count = 0
for line in pfx2as:
line = line.decode('utf-8')
line = line.rstrip('\n')
caida_vals = line.split('\t')
if len(caida_vals) != 3:
print('Invalid line in data retrieved from CAIDA ({})'.format(line))
prefix = '{}/{}'.format(caida_vals[0], caida_vals[1])
asn = caida_vals[2]
rnode = caida_radix.add(prefix)
rnode.data["AS"] = asn
pfx_count += 1
if (pfx_count % 100000) == 0:
sys.stdout.write('+')
sys.stdout.flush()
elif (pfx_count % 10000) == 0:
sys.stdout.write('-')
sys.stdout.flush()
pfx2as.close()
print()
print('Processed {} prefixes'.format(pfx_count))
return caida_radix
In [59]:
def lookup_asn(ip, lookup_radix4):
try:
rnode = lookup_radix4.search_best(ip)
if rnode:
return rnode.data.get('AS', '-')
else:
return '-' # not found
except Exception:
print(f"Radix lookup failed for invalid IP format: {ip}")
return '-' # invalid
In [62]:
# add missing ASNs using peeringdb
with gzip.open('data/caida/peeringdb_2_dump_2025_04_20.json.gz', 'rt', encoding='utf-8') as gzipped_file:
data = json.load(gzipped_file)
asn_ixp=[]
for i in data['netixlan']['data']:
asn=i['asn']
name=i['name']
address=i['ipaddr4']
asn_ixp.append((asn,name,address))
asn_ixp=pd.DataFrame(asn_ixp, columns=['asn', 'pop', 'addr'])
# convert ASN to string
asn_ixp['asn'] = asn_ixp['asn'].astype(str)
asn_ixp['asn'].str.replace(',', '_')
asn_ixp.head()
Out[62]:
| asn | pop | addr | |
|---|---|---|---|
| 0 | 8075 | Equinix New York | 198.32.118.91 |
| 1 | 20940 | SIX Seattle: MTU 1500 | 206.81.80.113 |
| 2 | 20940 | MegaIX Munich | 194.59.190.59 |
| 3 | 31800 | NL-ix: Main | 193.239.116.162 |
| 4 | 31800 | Equinix Dallas | 206.223.118.88 |
In [63]:
print(f"\nFound {len(unique_addr_df)} unique non-null IP strings to look up.") # unique hop addresses, excluding bogons
if 'lookup_radix4' not in locals():
lookup_radix4 = fetch_caida_data_as_radix()
# perform lookup
print(f"\nPerforming {len(unique_addr_df)} lookups in Radix tree...")
unique_addr_df['hop_asn'] = unique_addr_df['hop_addr'].apply(
lambda ip: lookup_asn(ip, lookup_radix4)
)
### add failed lookups using peeringdb
unique_addr_df['hop_asn'].fillna('-', inplace=True)
unique_addr_df = pd.merge(unique_addr_df, asn_ixp, left_on='hop_addr', right_on='addr', how='left')
traceroutes_df = pd.merge(
traceroutes_df,
unique_addr_df[['hop_addr', 'hop_asn']],
on='hop_addr',
how='left'
)
# unknown ASes have value '-'
traceroutes_df['hop_asn'].fillna('-', inplace=True)
print("Merge complete.")
traceroutes_df.head()
Found 93110 unique non-null IP strings to look up. Retrieving http://data.caida.org/datasets/routing/routeviews-prefix2as/2025/04/routeviews-rv2-20250419-0800.pfx2as.gz ... OK Read 3585254 bytes of compressed data ---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--- Processed 1035774 prefixes Performing 93110 lookups in Radix tree...
/var/folders/k_/b2zc8kg571bcqh2l9j8mj82h0000gn/T/ipykernel_9821/302428949.py:16: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
unique_addr_df['hop_asn'].fillna('-', inplace=True)
/var/folders/k_/b2zc8kg571bcqh2l9j8mj82h0000gn/T/ipykernel_9821/302428949.py:27: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
traceroutes_df['hop_asn'].fillna('-', inplace=True)
Merge complete.
Out[63]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | ... | ark_to_hop | ark_to_pop | hop_is_bogon | hop_city | hop_lat | hop_lon | hop_country_code | hop_region | hop_sub_region | hop_asn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar | 100.0.0.1 | 192.168.1.1 | 1 | 1.227 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | 125.373137 | 17949.203027 | True | <NA> | NaN | NaN | NaN | NaN | NaN | - |
| 1 | aep3-ar | 100.0.0.1 | 192.168.0.1 | 2 | 1.485 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | 151.735215 | 17949.203027 | True | <NA> | NaN | NaN | NaN | NaN | NaN | - |
| 2 | aep3-ar | 100.0.0.1 | 181.96.62.70 | 7 | 12.157 | 181.96.62.70 | host70.181-96-62.telecom.net.ar | -34.6 | -58.38 | Buenos Aires | ... | 1242.185189 | 17949.203027 | False | buenos aires | -34.613152 | -58.377232 | ARG | Americas | Latin America and the Caribbean | 7303 |
| 3 | aep3-ar | 100.0.0.1 | 195.22.220.58 | 8 | 10.412 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | 1063.883539 | 17949.203027 | False | buenos aires | -34.613152 | -58.377232 | ARG | Americas | Latin America and the Caribbean | 6762 |
| 4 | aep3-ar | 100.0.0.1 | 89.221.41.221 | 9 | 147.013 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | 15021.581900 | 17949.203027 | False | miami | 25.774269 | -80.193657 | USA | Americas | Northern America | 6762 |
5 rows × 23 columns
In [64]:
# how many ASes in the dataset?
traceroutes_df['hop_asn'].nunique()
Out[64]:
1828
In [65]:
traceroutes_df[traceroutes_df['hop_asn'].str.contains('_')].count() # 209k hops are inside MOAses
Out[65]:
tx_hostname 208587 dst 208587 hop_addr 208587 hop_probe_ttl 208587 hop_rtt 208587 ip4_address 164956 hop_name 164956 vp_lat 208587 vp_lon 208587 tx_city 208587 vp_asn 208587 tx_airport 208587 pop_rtt 208587 ark_to_hop 208587 ark_to_pop 208587 hop_is_bogon 208587 hop_city 207478 hop_lat 139297 hop_lon 139297 hop_country_code 139297 hop_region 138624 hop_sub_region 138624 hop_asn 208587 dtype: int64
In [66]:
# add anycast destination ASN
unique_dsts = traceroutes_df['dst'].unique()
asn_lookup_df = pd.DataFrame({
'dst': unique_dsts,
'dst_asn': [lookup_asn(ip, lookup_radix4) for ip in unique_dsts]
})
traceroutes_df = traceroutes_df.merge(asn_lookup_df, on='dst', how='left')
del unique_dsts
del asn_lookup_df
Calculate distances between hops¶
In [67]:
# drop hops with no locations
traceroutes_with_geolocs_df = traceroutes_df[
traceroutes_df['hop_lat'].notna() & traceroutes_df['hop_lon'].notna()
].copy()
# sort by trace and TTL
traceroutes_with_geolocs_df = traceroutes_with_geolocs_df.sort_values(
by=['tx_hostname', 'dst', 'hop_probe_ttl']
)
# for each trace we calculate the inter-hop distances
grouped_geoloc_traces = traceroutes_with_geolocs_df.groupby(['tx_hostname', 'dst'])
# get the lat,lon of the next hop
traceroutes_with_geolocs_df['next_geoloc_hop_lat'] = grouped_geoloc_traces['hop_lat'].shift(-1)
traceroutes_with_geolocs_df['next_geoloc_hop_lon'] = grouped_geoloc_traces['hop_lon'].shift(-1)
# join with all hop data
traceroutes_df = traceroutes_df.join(
traceroutes_with_geolocs_df[['next_geoloc_hop_lat', 'next_geoloc_hop_lon']]
)
# calculate distance between current hop and the next hop with geolocation data
def calculate_distance_to_next_geoloc_hop(row):
current_lat = row['hop_lat']
current_lon = row['hop_lon']
next_geoloc_lat = row['next_geoloc_hop_lat']
next_geoloc_lon = row['next_geoloc_hop_lon']
# Only calculate if the current hop has coordinates AND
# we found a subsequent hop with coordinates for it.
if pd.notna(current_lat) and pd.notna(current_lon) and \
pd.notna(next_geoloc_lat) and pd.notna(next_geoloc_lon):
return haversine(current_lat, current_lon, next_geoloc_lat, next_geoloc_lon)
return np.nan
traceroutes_df['distance_to_next_geoloc_hop_km'] = traceroutes_df.apply(
calculate_distance_to_next_geoloc_hop, axis=1
)
# drop temp columns
columns_to_drop = ['next_geoloc_hop_lat', 'next_geoloc_hop_lon']
traceroutes_df = traceroutes_df.drop(columns=columns_to_drop)
traceroutes_df.head()
Out[67]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | ... | hop_is_bogon | hop_city | hop_lat | hop_lon | hop_country_code | hop_region | hop_sub_region | hop_asn | dst_asn | distance_to_next_geoloc_hop_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar | 100.0.0.1 | 192.168.1.1 | 1 | 1.227 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | True | <NA> | NaN | NaN | NaN | NaN | NaN | - | 701 | NaN |
| 1 | aep3-ar | 100.0.0.1 | 192.168.0.1 | 2 | 1.485 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | True | <NA> | NaN | NaN | NaN | NaN | NaN | - | 701 | NaN |
| 2 | aep3-ar | 100.0.0.1 | 181.96.62.70 | 7 | 12.157 | 181.96.62.70 | host70.181-96-62.telecom.net.ar | -34.6 | -58.38 | Buenos Aires | ... | False | buenos aires | -34.613152 | -58.377232 | ARG | Americas | Latin America and the Caribbean | 7303 | 701 | 0.000000 |
| 3 | aep3-ar | 100.0.0.1 | 195.22.220.58 | 8 | 10.412 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | False | buenos aires | -34.613152 | -58.377232 | ARG | Americas | Latin America and the Caribbean | 6762 | 701 | 7097.454053 |
| 4 | aep3-ar | 100.0.0.1 | 89.221.41.221 | 9 | 147.013 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | False | miami | 25.774269 | -80.193657 | USA | Americas | Northern America | 6762 | 701 | 0.000000 |
5 rows × 25 columns
Get the inferred location of the anycast site reached (using the penultimate hop's location)¶
In [68]:
# get the p-hops (from which we infer the site reached)
p_hops = traceroutes_df[traceroutes_df['dst'] != traceroutes_df['hop_addr']].sort_values(by='hop_probe_ttl', ascending=False)\
.drop_duplicates(subset=['tx_hostname', 'dst'], keep='first')
p_hops.head()
Out[68]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | ... | hop_is_bogon | hop_city | hop_lat | hop_lon | hop_country_code | hop_region | hop_sub_region | hop_asn | dst_asn | distance_to_next_geoloc_hop_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3810728 | hkg5-cn | 23.11.40.7 | 23.40.189.1 | 43 | 453.065 | 23.40.189.1 | vlan100.r01.tor01.hkg01.fab.netarch.akamai.com | 22.32 | 114.22 | Hong Kong | ... | False | hong kong | 22.320304 | 114.198074 | CHN | Asia | Eastern Asia | 20940 | 33905 | NaN |
| 18836613 | puw-us | 23.11.40.7 | 23.203.145.249 | 43 | 289.470 | 23.203.145.249 | ae34.r03.border101.sea01.fab.netarch.akamai.com | 46.73 | -117.00 | Moscow | ... | False | seattle | 47.449001 | -122.308998 | USA | Americas | Northern America | 20940 | 33905 | NaN |
| 16070816 | bed-us | 23.11.38.17 | 23.198.9.65 | 38 | 340.269 | 23.198.9.65 | vlan100.r05.tor01.bos01.fab.netarch.akamai.com | 42.46 | -71.35 | Concord | ... | False | boston | 42.364300 | -71.005203 | USA | Americas | Northern America | 20940 | 33905 | NaN |
| 2460881 | ind-us | 23.11.38.17 | 154.24.97.54 | 37 | 462.866 | NaN | NaN | 39.77 | -86.16 | Indianapolis | ... | False | indianapolis | 39.768379 | -86.158043 | USA | Americas | Northern America | 174 | 33905 | NaN |
| 19281608 | cpv-br | 23.11.38.17 | 23.32.63.57 | 37 | 590.487 | 23.32.63.57 | ae4.r01.gru01.icn.netarch.akamai.com | -7.23 | -35.88 | Campina Grande | ... | False | sao paulo | -23.435556 | -46.473057 | BRA | Americas | Latin America and the Caribbean | 20940 | 33905 | NaN |
5 rows × 25 columns
In [69]:
# calculate distance between phop and VP coordinates
def calculate_haversine_for_row(row):
# Extract coordinates from the row
vp_coords = (row['vp_lat'], row['vp_lon'])
reached_coords = (row['hop_lat'], row['hop_lon'])
# Check for NaN values in any of the coordinates
if pd.isna(vp_coords[0]) or pd.isna(vp_coords[1]) or \
pd.isna(reached_coords[0]) or pd.isna(reached_coords[1]):
return np.nan # Return NaN if any coordinate is missing
distance = haversine(vp_coords[0], vp_coords[1], reached_coords[0], reached_coords[1])
return distance
p_hops['dist_to_phop'] = p_hops.apply(calculate_haversine_for_row, axis=1)
In [70]:
# how many p-hops do we have?
print(f"Number of p-hops {len(p_hops)}")
Number of p-hops 3349745
In [71]:
# how many have city information
print(f"number of phops with city information {len(p_hops[~p_hops['hop_city'].isna()])}")
print(f"number of phops with no city information {len(p_hops[p_hops['hop_city'].isna()])}")
number of phops with city information 3157078 number of phops with no city information 192667
In [72]:
# merge data into traceroutes_df as reached_city, reached_lon, reached_lat
p_hops_renamed = p_hops[['tx_hostname', 'dst', 'hop_city', 'hop_lat', 'hop_lon', 'dist_to_phop']].rename(columns={
'hop_city': 'reached_city',
'hop_lat': 'reached_lat',
'hop_lon': 'reached_lon',
'dist_to_phop': 'dist_to_phop',
# 'ark_to_phop': 'reached_distance', # inferred distance to reached anycast site (based on RTT)
})
In [73]:
traceroutes_df = traceroutes_df.merge(p_hops_renamed, on=['tx_hostname', 'dst'], how='left')
traceroutes_df.head()
Out[73]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | ... | hop_country_code | hop_region | hop_sub_region | hop_asn | dst_asn | distance_to_next_geoloc_hop_km | reached_city | reached_lat | reached_lon | dist_to_phop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar | 100.0.0.1 | 192.168.1.1 | 1 | 1.227 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | NaN | NaN | NaN | - | 701 | NaN | miami | 25.774269 | -80.193657 | 7096.002663 |
| 1 | aep3-ar | 100.0.0.1 | 192.168.0.1 | 2 | 1.485 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | NaN | NaN | NaN | - | 701 | NaN | miami | 25.774269 | -80.193657 | 7096.002663 |
| 2 | aep3-ar | 100.0.0.1 | 181.96.62.70 | 7 | 12.157 | 181.96.62.70 | host70.181-96-62.telecom.net.ar | -34.6 | -58.38 | Buenos Aires | ... | ARG | Americas | Latin America and the Caribbean | 7303 | 701 | 0.000000 | miami | 25.774269 | -80.193657 | 7096.002663 |
| 3 | aep3-ar | 100.0.0.1 | 195.22.220.58 | 8 | 10.412 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | ARG | Americas | Latin America and the Caribbean | 6762 | 701 | 7097.454053 | miami | 25.774269 | -80.193657 | 7096.002663 |
| 4 | aep3-ar | 100.0.0.1 | 89.221.41.221 | 9 | 147.013 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | USA | Americas | Northern America | 6762 | 701 | 0.000000 | miami | 25.774269 | -80.193657 | 7096.002663 |
5 rows × 29 columns
Get closest anycast site (according to census data)¶
In [74]:
# add /24-prefix column
traceroutes_df['dst_prefix'] = traceroutes_df['dst'].apply(lambda x: '.'.join(x.split('.')[:3]) + '.0/24')
In [75]:
# load in census locations data https://github.com/anycast-census/anycast-census/tree/main/2025/04/19/2024-04-19_v4_locations.json.gz
url = "https://raw.githubusercontent.com/anycast-census/anycast-census/main/2025/04/19/2025-04-19_v4_locations.json.gz"
response = requests.get(url)
response.raise_for_status()
with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as gz:
census_locations = pd.read_json(gz, lines=False)
census_locations.head()
Out[75]:
| prefix | count | instances | |
|---|---|---|---|
| 0 | 185.236.105.0/24 | 31 | [{'marker': {'city': 'N/A', 'code_country': 'N... |
| 1 | 104.17.173.0/24 | 70 | [{'marker': {'city': 'Honolulu', 'code_country... |
| 2 | 34.160.213.0/24 | 43 | [{'marker': {'city': 'Madrid', 'code_country':... |
| 3 | 35.227.194.0/24 | 42 | [{'marker': {'city': 'Miami', 'code_country': ... |
| 4 | 146.75.71.0/24 | 22 | [{'marker': {'city': 'Stockholm', 'code_countr... |
In [76]:
# build BallTrees (one for each anycast prefix)
census_ball_trees_data = {}
print("building balltree")
for _, census_row in census_locations.iterrows():
prefix_id = census_row['prefix']
instances_list = census_row['instances']
valid_instance_coords_rad = []
valid_instance_markers = []
for instance_entry in instances_list:
marker = instance_entry['marker']
lat = marker['latitude']
lon = marker['longitude']
# Store the marker data itself
valid_instance_markers.append(marker)
# Convert to radians for BallTree
valid_instance_coords_rad.append([radians(lat), radians(lon)])
# BallTree expects a NumPy array of radian coordinates
coords_array_rad = np.array(valid_instance_coords_rad)
tree = BallTree(coords_array_rad, metric='haversine')
census_ball_trees_data[prefix_id] = {
'ball_tree': tree,
'instance_markers': valid_instance_markers
}
print('finished balltree')
# get unique hostname,dst pairs
unique_routes = traceroutes_df.groupby(['tx_hostname', 'dst_prefix']).agg(
vp_lat=('vp_lat', 'first'),
vp_lon=('vp_lon', 'first')
).reset_index()
print('finding locs')
# find nearest locations using balltrees
results = []
for _, route_row in unique_routes.iterrows():
tx_hostname = route_row['tx_hostname']
dst_prefix = route_row['dst_prefix']
vp_lat = route_row['vp_lat']
vp_lon = route_row['vp_lon']
nearest_census_city = None
min_distance_km = None
nearest_census_lat = None
nearest_census_lon = None
if dst_prefix in census_ball_trees_data:
tree_info = census_ball_trees_data[dst_prefix]
ball_tree = tree_info['ball_tree']
instance_markers_for_prefix = tree_info['instance_markers']
if ball_tree is not None and instance_markers_for_prefix:
query_point_rad = np.array([[radians(vp_lat), radians(vp_lon)]])
# k=1 for the single nearest neighbor
distances_rad, indices = ball_tree.query(query_point_rad, k=1)
nearest_idx = indices[0][0]
angular_distance_rad = distances_rad[0][0]
min_distance_km = angular_distance_rad * EARTH_RADIUS_KM
nearest_marker = instance_markers_for_prefix[nearest_idx]
nearest_census_city = nearest_marker.get('city')
nearest_census_lat = nearest_marker.get('lat')
nearest_census_lon = nearest_marker.get('lon')
results.append({
'tx_hostname': tx_hostname,
'dst_prefix': dst_prefix,
'vp_lat': vp_lat,
'vp_lon': vp_lon,
'nearest_census_city': nearest_census_city,
'dist_nearest_site': min_distance_km,
'nearest_census_lat': nearest_census_lat,
'nearest_census_lon': nearest_census_lon
})
result_df = pd.DataFrame(results)
result_df.head()
building balltree finished balltree finding locs
Out[76]:
| tx_hostname | dst_prefix | vp_lat | vp_lon | nearest_census_city | dist_nearest_site | nearest_census_lat | nearest_census_lon | |
|---|---|---|---|---|---|---|---|---|
| 0 | abz2-uk | 1.1.1.0/24 | 57.15 | -2.09 | Liverpool | 427.081339 | None | None |
| 1 | abz2-uk | 1.10.10.0/24 | 57.15 | -2.09 | Delhi | 6740.474981 | None | None |
| 2 | abz2-uk | 1.12.0.0/24 | 57.15 | -2.09 | Frankfurt-am-Main | 1051.630573 | None | None |
| 3 | abz2-uk | 1.12.12.0/24 | 57.15 | -2.09 | Frankfurt-am-Main | 1051.630573 | None | None |
| 4 | abz2-uk | 1.12.13.0/24 | 57.15 | -2.09 | Frankfurt-am-Main | 1051.630573 | None | None |
In [77]:
# filter on columns we care about
result_df = result_df[[
'tx_hostname',
'dst_prefix',
'nearest_census_city',
'nearest_census_lat',
'nearest_census_lon',
'dist_nearest_site',
]]
In [78]:
# merge results into traceroutes_df
traceroutes_df = traceroutes_df.merge(result_df, on=['tx_hostname', 'dst_prefix'], how='left')
traceroutes_df.head()
Out[78]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | ... | distance_to_next_geoloc_hop_km | reached_city | reached_lat | reached_lon | dist_to_phop | dst_prefix | nearest_census_city | nearest_census_lat | nearest_census_lon | dist_nearest_site | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aep3-ar | 100.0.0.1 | 192.168.1.1 | 1 | 1.227 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | NaN | miami | 25.774269 | -80.193657 | 7096.002663 | 100.0.0.0/24 | New York | None | None | 8513.582265 |
| 1 | aep3-ar | 100.0.0.1 | 192.168.0.1 | 2 | 1.485 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | NaN | miami | 25.774269 | -80.193657 | 7096.002663 | 100.0.0.0/24 | New York | None | None | 8513.582265 |
| 2 | aep3-ar | 100.0.0.1 | 181.96.62.70 | 7 | 12.157 | 181.96.62.70 | host70.181-96-62.telecom.net.ar | -34.6 | -58.38 | Buenos Aires | ... | 0.000000 | miami | 25.774269 | -80.193657 | 7096.002663 | 100.0.0.0/24 | New York | None | None | 8513.582265 |
| 3 | aep3-ar | 100.0.0.1 | 195.22.220.58 | 8 | 10.412 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | 7097.454053 | miami | 25.774269 | -80.193657 | 7096.002663 | 100.0.0.0/24 | New York | None | None | 8513.582265 |
| 4 | aep3-ar | 100.0.0.1 | 89.221.41.221 | 9 | 147.013 | NaN | NaN | -34.6 | -58.38 | Buenos Aires | ... | 0.000000 | miami | 25.774269 | -80.193657 | 7096.002663 | 100.0.0.0/24 | New York | None | None | 8513.582265 |
5 rows × 34 columns
finding RP candidate links¶
In [2]:
# save progress
load = True # flip to load milestone data
if load:
traceroutes_df = pd.read_csv('./data/milestone.csv.gz', compression='gzip')
# fix type
traceroutes_df['hop_asn'] = traceroutes_df['hop_asn'].astype(str)
traceroutes_df['dst_asn'] = traceroutes_df['dst_asn'].astype(str)
else:
traceroutes_df.to_csv('./data/milestone.csv.gz', compression='gzip', index=False) # 700 MB ish
/var/folders/k_/b2zc8kg571bcqh2l9j8mj82h0000gn/T/ipykernel_52734/2692236368.py:5: DtypeWarning: Columns (10,19,20,21,22,23) have mixed types. Specify dtype option on import or set low_memory=False.
traceroutes_df = pd.read_csv('./data/milestone.csv.gz', compression='gzip')
In [3]:
### get next_hop information for each traceroute
# firstsort such that hops increment
traceroutes_df = traceroutes_df.sort_values(by=['tx_hostname', 'dst', 'hop_probe_ttl'])
# get the next values
traceroutes_df['next_ttl'] = traceroutes_df.groupby(['tx_hostname', 'dst'])['hop_probe_ttl'].shift(-1)
traceroutes_df['next_rtt'] = traceroutes_df.groupby(['tx_hostname', 'dst'])['hop_rtt'].shift(-1)
traceroutes_df['next_hop_city'] = traceroutes_df.groupby(['tx_hostname', 'dst'])['hop_city'].shift(-1)
traceroutes_df['next_hop_rtt'] = traceroutes_df.groupby(['tx_hostname', 'dst'])['hop_rtt'].shift(-1)
traceroutes_df['rtt_delta_next_hop'] = traceroutes_df['next_hop_rtt'] - traceroutes_df['hop_rtt']
traceroutes_df['next_hop_is_bogon'] = traceroutes_df.groupby(['tx_hostname', 'dst'])['hop_is_bogon'].shift(-1)
traceroutes_df['next_hop_asn'] = traceroutes_df.groupby(['tx_hostname', 'dst'])['hop_asn'].shift(-1)
traceroutes_df['next_hop_addr'] = traceroutes_df.groupby(['tx_hostname', 'dst'])['hop_addr'].shift(-1)
In [4]:
def parse_as_string_to_set(asn_string):
"""
Examples:
'1114_892' -> {'1114', '892'}
'892' -> {'892'}
'-' -> set()
NaN -> set()
"""
# invalid and NaNs are empty sets
if pd.isna(asn_string) or asn_string == '-':
return set()
# split MOASes and create a set
return set(str(asn_string).split('_'))
In [5]:
# filters
# adjacent hops
is_consecutive = traceroutes_df['next_ttl'] == traceroutes_df['hop_probe_ttl'] + 1
# non-bogons
is_not_bogon = (traceroutes_df['hop_is_bogon'] == False) & (traceroutes_df['next_hop_is_bogon'] == False)
# ignore anycast links
is_not_anycast = ~traceroutes_df['hop_city'].isin(['anycast']) & ~traceroutes_df['next_hop_city'].isin(['anycast'])
# hops in a different city
is_different_city = (traceroutes_df['hop_city'] != traceroutes_df['next_hop_city']) & (traceroutes_df['hop_city'].notna() & traceroutes_df['next_hop_city'].notna())
# (two adjacent ASes that are distinct)
# is_different_as = (traceroutes_df['hop_asn'] != traceroutes_df['next_hop_asn']) & (~traceroutes_df['hop_asn'].isin(['-']) & ~traceroutes_df['next_hop_asn'].isin(['-']))
is_different_as = pd.Series(
[
# isdisjoint() is True if the sets have no common elements
parse_as_string_to_set(hop).isdisjoint(parse_as_string_to_set(next_hop))
for hop, next_hop in zip(
traceroutes_df['hop_asn'],
traceroutes_df['next_hop_asn']
)
],
index=traceroutes_df.index
) & (~traceroutes_df['hop_asn'].isin(['-']) & ~traceroutes_df['next_hop_asn'].isin(['-']))
# check for adjacent hops in the same AS
is_same_as = pd.Series(
[
# isdisjoint() is True if the sets have no common elements
~parse_as_string_to_set(hop).isdisjoint(parse_as_string_to_set(next_hop))
for hop, next_hop in zip(
traceroutes_df['hop_asn'],
traceroutes_df['next_hop_asn']
)
],
index=traceroutes_df.index
) & (~traceroutes_df['hop_asn'].isin(['-']) & ~traceroutes_df['next_hop_asn'].isin(['-']))
In [6]:
def link_statistics(condition):
"""
Examples:
link_statistics(is_different_city) -> get all links connecting hops in different cities
:param condition: the mask to check against
:return: df of traceroute_hops matching the condition and their next hop
"""
first_hops = traceroutes_df.loc[condition]
number_of_traceroutes = first_hops[['tx_hostname', 'dst']].drop_duplicates().shape[0]
print(f"Found {len(first_hops):,} links inside {number_of_traceroutes:,} traceroutes")
# get unique AS links
as_pairs = first_hops[['hop_asn', 'next_hop_asn']].to_numpy()
valid_rows_mask = (as_pairs[:, 0] != '-') & (as_pairs[:, 1] != '-') # filter out AS pairs with unknowns
filtered_as_pairs = as_pairs[valid_rows_mask]
as_pairs_sorted = np.sort(filtered_as_pairs, axis=1) # sort to not count AS1 -> AS2 and AS2 -> AS1 as two distinct links
unique_as_links = set(map(tuple, as_pairs_sorted))
# get unique IP links
ip_pairs = first_hops[['hop_addr', 'next_hop_addr']].to_numpy()
ip_pairs_sorted = np.sort(ip_pairs, axis=1)
unique_ip_links = set(map(tuple, ip_pairs_sorted))
# Print statistics
print("---")
print(f"Number of unique AS links (unordered): {len(unique_as_links):,}")
print(f"Number of unique IP links (unordered): {len(unique_ip_links):,}")
In [7]:
# filter on incrementing TTLs
# a link is two adjacent hops with a single increment in TTL
condition = (
is_consecutive # & is_not_anycast
)
links_indexes = traceroutes_df[condition].index
links_next_indexes = links_indexes + 1 # second link hops
all_links = traceroutes_df.loc[links_indexes.union(links_next_indexes)].sort_values(by=['tx_hostname', 'dst', 'hop_probe_ttl'])
link_statistics(condition)
Found 21,061,981 links inside 3,340,490 traceroutes --- Number of unique AS links (unordered): 12,980 Number of unique IP links (unordered): 2,136,351
In [8]:
# filter out hops with either side bogon
condition = (
is_consecutive & is_not_bogon # & is_not_anycast
)
nb_indexes = traceroutes_df[condition].index
nb_next_indexes = nb_indexes + 1 # second link hops
nb_links = traceroutes_df.loc[nb_indexes.union(nb_next_indexes)].sort_values(by=['tx_hostname', 'dst', 'hop_probe_ttl'])
link_statistics(condition)
Found 18,141,520 links inside 3,303,531 traceroutes --- Number of unique AS links (unordered): 12,980 Number of unique IP links (unordered): 2,098,527
In [9]:
# distinct city links
condition = (
is_consecutive & is_not_anycast & is_not_bogon & is_different_city
)
dc_indexes = traceroutes_df[condition].index
dc_next_indexes = dc_indexes + 1 # second link hops
dc_links = traceroutes_df.loc[dc_indexes.union(dc_next_indexes)].sort_values(by=['tx_hostname', 'dst', 'hop_probe_ttl'])
link_statistics(condition)
Found 4,400,608 links inside 1,989,098 traceroutes --- Number of unique AS links (unordered): 2,680 Number of unique IP links (unordered): 72,645
In [10]:
# distinct city + distinct AS links (candidate RPs)
condition = (
is_consecutive & is_not_anycast & is_not_bogon & is_different_city & is_different_as
)
crp_indexes = traceroutes_df[condition].index
crp_next_indexes = crp_indexes + 1 # second link hops
crp_links = traceroutes_df.loc[crp_indexes.union(crp_next_indexes)].sort_values(by=['tx_hostname', 'dst', 'hop_probe_ttl'])
link_statistics(condition)
Found 891,663 links inside 757,942 traceroutes --- Number of unique AS links (unordered): 2,293 Number of unique IP links (unordered): 16,546
In [11]:
# same-AS links
condition = (
is_consecutive & is_not_bogon & is_same_as # & is_not_anycast
)
sas_indexes = traceroutes_df[condition].index
sas_next_indexes = sas_indexes + 1 # second link hops
sas_links = traceroutes_df.loc[sas_indexes.union(sas_next_indexes)].sort_values(by=['tx_hostname', 'dst', 'hop_probe_ttl'])
link_statistics(condition)
Found 11,398,325 links inside 3,135,618 traceroutes --- Number of unique AS links (unordered): 856 Number of unique IP links (unordered): 787,833
In [12]:
# same AS different city links
condition = (
is_consecutive & is_not_anycast & is_not_bogon & is_different_city & is_same_as
)
sasdc_indexes = traceroutes_df[condition].index
sasdc_next_indexes = sasdc_indexes + 1 # second link hops
sasdc_links = traceroutes_df.loc[sasdc_next_indexes.union(crp_next_indexes)].sort_values(by=['tx_hostname', 'dst', 'hop_probe_ttl'])
link_statistics(condition)
Found 3,055,713 links inside 1,572,018 traceroutes --- Number of unique AS links (unordered): 387 Number of unique IP links (unordered): 45,653
In [14]:
# get AS organisation data
organization_info = []
asn_info = []
with gzip.open('data/caida/20250401.as-org2info.jsonl.gz', 'rt', encoding='utf-8') as gzipped_file:
for line in gzipped_file:
line= json.loads(line)
if line['type'] =='Organization':
if 'name' in line:
name= line['name']
else:
name = 'Unknown'
organization_info.append((line['organizationId'], name, line['country']))
else:
if 'name' in line:
name= line['name']
else:
name = 'Unknown'
asn_info.append((line['asn'],name, line['organizationId']))
organization_df = pd.DataFrame(organization_info, columns=['org_id', 'org_name', 'org_country'])
organization_df.head()
Out[14]:
| org_id | org_name | org_country | |
|---|---|---|---|
| 0 | 1800CO-2-ARIN | 1-800 Contacts, Inc. | US |
| 1 | 1800FL-ARIN | 1-800-Flowers.com, Inc. | US |
| 2 | 1800H-ARIN | 1-800-HOSTING, Inc. | US |
| 3 | 1881CS-ARIN | 1881CS/XPC | US |
| 4 | 1FBU-ARIN | 1st Financial Bank USA | US |
In [15]:
# ASN to Org mapping
asn_df = pd.DataFrame(asn_info, columns=['asn', 'asn_name', 'org_id'])
asn_df.head()
Out[15]:
| asn | asn_name | org_id | |
|---|---|---|---|
| 0 | 1 | LVLT-1 | LPL-141-ARIN |
| 1 | 2 | UDEL-DCN | UNIVER-19-Z-ARIN |
| 2 | 3 | MIT-GATEWAYS | MIT-2-ARIN |
| 3 | 4 | ISI-AS | USC-32-Z-ARIN |
| 4 | 5 | SYMBOLICS | WGL-117-ARIN |
In [16]:
# get the candidate RP links AS mappings (as1 -> as2)
df_selected_final_prova=traceroutes_df.loc[crp_indexes] # weird italian variable name :)
asn_list = list(zip(df_selected_final_prova['hop_asn'], df_selected_final_prova['next_hop_asn']))
asn_flat = [asn for tup in asn_list for asn in tup]
counter = Counter(asn_flat)
In [17]:
# get Organisation data
traceorute_org=[]
for asn, count in counter.items():
orgs=[]
names=[]
asns = set()
if '_' in asn:
temp_set = set(asn.split("_"))
for item in temp_set:
if ',' in item:
for sub_item in item.split(','):
asns.add(sub_item)
else:
asns.add(item)
for asn in asns:
org_id = asn_df.loc[asn_df['asn'] == asn, 'org_id'].values
if len(org_id) > 0:
org_id = org_id[0]
org_name = organization_df.loc[organization_df['org_id'] == org_id, 'org_name'].values
if len(org_name) > 0:
org_name = org_name[0]
else:
org_name = 'Unknown'
else:
org_id = None
org_name = 'Unknown'
orgs.append(org_id)
names.append(org_name)
else:
org_id = asn_df.loc[asn_df['asn'] == asn, 'org_id'].values
if len(org_id) > 0:
org_id = org_id[0]
org_name = organization_df.loc[organization_df['org_id'] == org_id, 'org_name'].values
if len(org_name) > 0:
org_name = org_name[0]
else:
org_name = 'Unknown'
else:
org_id = None
org_name = 'Unknown'
orgs.append(org_id)
names.append(org_name)
asns.add(asn)
traceorute_org.append((asns, orgs, names))
# unique anycast ASes targeted
dst_asn = set(traceroutes_df['dst_asn'].unique())
for asn in dst_asn:
orgs=[]
names=[]
asns = set()
if '_' in asn:
temp_set = set(asn.split("_"))
for item in temp_set:
if ',' in item:
for sub_item in item.split(','):
asns.add(sub_item)
else:
asns.add(item)
for asn in asns:
org_id = asn_df.loc[asn_df['asn'] == asn, 'org_id'].values
if len(org_id) > 0:
org_id = org_id[0]
org_name = organization_df.loc[organization_df['org_id'] == org_id, 'org_name'].values
if len(org_name) > 0:
org_name = org_name[0]
else:
org_name = 'Unknown'
else:
org_id = None
org_name = 'Unknown'
orgs.append(org_id)
names.append(org_name)
else:
org_id = asn_df.loc[asn_df['asn'] == asn, 'org_id'].values
if len(org_id) > 0:
org_id = org_id[0]
org_name = organization_df.loc[organization_df['org_id'] == org_id, 'org_name'].values
if len(org_name) > 0:
org_name = org_name[0]
else:
org_name = 'Unknown'
else:
org_id = None
org_name = 'Unknown'
orgs.append(org_id)
names.append(org_name)
asns.add(asn)
traceorute_org.append((asns, orgs, names))
org_df = pd.DataFrame(traceorute_org, columns=['ASN', 'Org ID', 'Org Name'])
org_df.head()
Out[17]:
| ASN | Org ID | Org Name | |
|---|---|---|---|
| 0 | {34984} | [ORG-BTHA1-RIPE] | [Superonline Iletisim Hizmetleri A.S.] |
| 1 | {31713, 3491} | [ORG-CDCA1-RIPE, BNA-42-ARIN] | [Gateway Communications, PCCW Global, Inc.] |
| 2 | {7578} | [ORG-GNPL1-AP-APNIC] | [GSL Networks Pty LTD] |
| 3 | {6461} | [ZAYOB-ARIN] | [Zayo Bandwidth] |
| 4 | {18119} | [ORG-ACS1-AP-APNIC] | [Advanced Computer Solutions] |
In [18]:
# map AS to organization (an organization may have multiple ASes)
asn_to_org_id_map = {}
for index, row in org_df.iterrows():
if isinstance(row['ASN'], set):
orgs=[]
for asn_val in row['ASN']:
orgs.append(row['Org ID'])
if '_'.join(map(str, row['ASN'])) not in asn_to_org_id_map:
asn_to_org_id_map['_'.join(map(str, row['ASN']))]=set(orgs[0])
else:
if row['ASN'] not in asn_to_org_id_map:
asn_to_org_id_map[row['ASN']] = set(row['Org ID'])
In [19]:
# add hop and anycast dst organization data
traceroutes_df['hop_org'] = traceroutes_df['hop_asn'].map(asn_to_org_id_map)
traceroutes_df['next_hop_org'] = traceroutes_df['next_hop_asn'].map(asn_to_org_id_map)
traceroutes_df['dst_org'] = traceroutes_df['dst_asn'].map(asn_to_org_id_map)
traceroutes_df['hop_org'] = traceroutes_df['hop_org'].apply(lambda x: set() if pd.isna(x) else x)
traceroutes_df['next_hop_org'] = traceroutes_df['next_hop_org'].apply(lambda x: set() if pd.isna(x) else x)
traceroutes_df['dst_org'] = traceroutes_df['dst_org'].apply(lambda x: set() if pd.isna(x) else x)
traceroutes_df.head()
Out[19]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | ... | next_rtt | next_hop_city | next_hop_rtt | rtt_delta_next_hop | next_hop_is_bogon | next_hop_asn | next_hop_addr | hop_org | next_hop_org | dst_org | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1431371 | abz2-uk | 1.1.1.0 | 137.50.19.1 | 1 | 0.301 | 137.50.19.1 | milliways2-sbx-one-ipv4.erg.abdn.ac.uk | 57.15 | -2.09 | Aberdeen | ... | 0.526 | aberdeen | 0.526 | 0.225 | False | 786 | 137.50.0.33 | {ORG-UA23-RIPE} | {ORG-UA23-RIPE} | {CLOUD14-ARIN} |
| 1431372 | abz2-uk | 1.1.1.0 | 137.50.0.33 | 2 | 0.526 | 137.50.0.33 | ew-erg.wireless.abdn.ac.uk | 57.15 | -2.09 | Aberdeen | ... | 0.522 | aberdeen | 0.522 | -0.004 | False | 786 | 137.50.255.225 | {ORG-UA23-RIPE} | {ORG-UA23-RIPE} | {CLOUD14-ARIN} |
| 1431373 | abz2-uk | 1.1.1.0 | 137.50.255.225 | 3 | 0.522 | 137.50.255.225 | ewgw-core-to-mbgw-core-byod.abdn.ac.uk | 57.15 | -2.09 | Aberdeen | ... | 0.417 | aberdeen | 0.417 | -0.105 | False | 786 | 137.50.255.150 | {ORG-UA23-RIPE} | {ORG-UA23-RIPE} | {CLOUD14-ARIN} |
| 1431374 | abz2-uk | 1.1.1.0 | 137.50.255.150 | 4 | 0.417 | 137.50.255.150 | ewbyodfw3-to-ewgw-core.wireless.abdn.ac.uk | 57.15 | -2.09 | Aberdeen | ... | 0.531 | NaN | 0.531 | 0.114 | False | 786 | 146.97.128.9 | {ORG-UA23-RIPE} | {ORG-UA23-RIPE} | {CLOUD14-ARIN} |
| 1431375 | abz2-uk | 1.1.1.0 | 146.97.128.9 | 5 | 0.531 | 146.97.128.9 | xe-0-1-0.aberew-ban1.ja.net | 57.15 | -2.09 | Aberdeen | ... | 3.984 | glasgow | 3.984 | 3.453 | False | 786 | 146.97.37.173 | {ORG-UA23-RIPE} | {ORG-UA23-RIPE} | {CLOUD14-ARIN} |
5 rows × 45 columns
In [20]:
# org filters
# True -> both hops are in the same organization as the anycast org
# False -> hops are in a different organization than the anycast org
org_condition = pd.Series(
[
len(d & h & n) > 0
for d, h, n in zip(
traceroutes_df['dst_org'],
traceroutes_df['hop_org'],
traceroutes_df['next_hop_org']
)
],
index=traceroutes_df.index
)
is_outside_anycast_org = ~org_condition
In [21]:
# step 5. filter out same-org-as-anycast-dst links
condition = (
is_consecutive & is_not_anycast & is_not_bogon & is_different_city & is_different_as & is_outside_anycast_org
)
crp_indexes = traceroutes_df[condition].index
crp_next_indexes = crp_indexes + 1 # second link hops
crp_links = traceroutes_df.loc[crp_indexes.union(crp_next_indexes)].sort_values(by=['tx_hostname', 'dst', 'hop_probe_ttl'])
link_statistics(condition)
Found 890,966 links inside 757,559 traceroutes --- Number of unique AS links (unordered): 2,288 Number of unique IP links (unordered): 16,288
In [23]:
# get PoP (IXP facilities) data
asn_per_ixp=None
with open('./data/asn_per_ixp_map_may_2025.json','r') as f:
asn_per_ixp=json.load(f)
In [24]:
ixp_per_asn={}
for ixp in asn_per_ixp:
for asn in asn_per_ixp[ixp]['ixp_members']:
asn=str(asn)
if asn not in ixp_per_asn:
ixp_per_asn[asn] = []
ixp_per_asn[asn].append(ixp)
In [25]:
def get_ixps_as_frozenset(asn_string, ixp_lookup_dict):
"""
Parses a MOAS string and returns a single IMMUTABLE frozenset of all associated IXPs.
"""
if pd.isna(asn_string) or asn_string == '-':
# Return an empty frozenset for consistency
return frozenset()
individual_asns = str(asn_string).split('_')
total_ixps = set()
for asn in individual_asns:
total_ixps.update(ixp_lookup_dict.get(asn, []))
# The only change is here: return an immutable, hashable frozenset
return frozenset(total_ixps)
In [26]:
# get unique ASes
unique_asns = pd.unique(traceroutes_df[['hop_asn', 'next_hop_asn']].values.ravel('K'))
# create mapping of AS to IXPs for all unique ASes (including MOASes)
ixps_map = {
asn_str: get_ixps_as_frozenset(asn_str, ixp_per_asn)
for asn_str in unique_asns
}
# get hop IXP and next hop IXP
traceroutes_df['hop_ixps'] = traceroutes_df['hop_asn'].map(ixps_map)
traceroutes_df['next_hop_ixps'] = traceroutes_df['next_hop_asn'].map(ixps_map)
# clean up misses
traceroutes_df['hop_ixps'] = traceroutes_df['hop_ixps'].apply(
lambda x: x if pd.notna(x) else frozenset()
)
traceroutes_df['next_hop_ixps'] = traceroutes_df['next_hop_ixps'].apply(
lambda x: x if pd.notna(x) else frozenset()
)
In [27]:
# get IXP intersections
pairs_series = pd.Series(
zip(traceroutes_df['hop_ixps'], traceroutes_df['next_hop_ixps']),
index=traceroutes_df.index
)
print("Finding unique pairs and pre-computing intersections...")
unique_pairs = pairs_series.unique()
print(f"Found {len(unique_pairs)} unique IXP pairs to process.")
intersection_map = {
pair: pair[0] & pair[1] for pair in unique_pairs
}
print("Mapping final intersection results...")
traceroutes_df['ixps_intersection'] = pairs_series.map(intersection_map)
traceroutes_df.head()
Finding unique pairs and pre-computing intersections... Found 12497 unique IXP pairs to process. Mapping final intersection results...
Out[27]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | ... | rtt_delta_next_hop | next_hop_is_bogon | next_hop_asn | next_hop_addr | hop_org | next_hop_org | dst_org | hop_ixps | next_hop_ixps | ixps_intersection | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1431371 | abz2-uk | 1.1.1.0 | 137.50.19.1 | 1 | 0.301 | 137.50.19.1 | milliways2-sbx-one-ipv4.erg.abdn.ac.uk | 57.15 | -2.09 | Aberdeen | ... | 0.225 | False | 786 | 137.50.0.33 | {ORG-UA23-RIPE} | {ORG-UA23-RIPE} | {CLOUD14-ARIN} | (583, 18, 321, 435) | (583, 18, 321, 435) | (583, 18, 321, 435) |
| 1431372 | abz2-uk | 1.1.1.0 | 137.50.0.33 | 2 | 0.526 | 137.50.0.33 | ew-erg.wireless.abdn.ac.uk | 57.15 | -2.09 | Aberdeen | ... | -0.004 | False | 786 | 137.50.255.225 | {ORG-UA23-RIPE} | {ORG-UA23-RIPE} | {CLOUD14-ARIN} | (583, 18, 321, 435) | (583, 18, 321, 435) | (583, 18, 321, 435) |
| 1431373 | abz2-uk | 1.1.1.0 | 137.50.255.225 | 3 | 0.522 | 137.50.255.225 | ewgw-core-to-mbgw-core-byod.abdn.ac.uk | 57.15 | -2.09 | Aberdeen | ... | -0.105 | False | 786 | 137.50.255.150 | {ORG-UA23-RIPE} | {ORG-UA23-RIPE} | {CLOUD14-ARIN} | (583, 18, 321, 435) | (583, 18, 321, 435) | (583, 18, 321, 435) |
| 1431374 | abz2-uk | 1.1.1.0 | 137.50.255.150 | 4 | 0.417 | 137.50.255.150 | ewbyodfw3-to-ewgw-core.wireless.abdn.ac.uk | 57.15 | -2.09 | Aberdeen | ... | 0.114 | False | 786 | 146.97.128.9 | {ORG-UA23-RIPE} | {ORG-UA23-RIPE} | {CLOUD14-ARIN} | (583, 18, 321, 435) | (583, 18, 321, 435) | (583, 18, 321, 435) |
| 1431375 | abz2-uk | 1.1.1.0 | 146.97.128.9 | 5 | 0.531 | 146.97.128.9 | xe-0-1-0.aberew-ban1.ja.net | 57.15 | -2.09 | Aberdeen | ... | 3.453 | False | 786 | 146.97.37.173 | {ORG-UA23-RIPE} | {ORG-UA23-RIPE} | {CLOUD14-ARIN} | (583, 18, 321, 435) | (583, 18, 321, 435) | (583, 18, 321, 435) |
5 rows × 48 columns
In [28]:
# IXP filter
# if IXP intersection exists -> the link is possibly inside an IXP
is_inside_ixp = traceroutes_df['ixps_intersection'].apply(lambda x: len(x) > 0)
is_outside_ixp = traceroutes_df['ixps_intersection'].apply(lambda x: len(x) == 0)
In [29]:
# distinct city + distinct AS links (candidate RPs) + connected with an IXP
condition = (
is_consecutive & is_not_anycast & is_not_bogon & is_different_city & is_different_as & is_outside_anycast_org & is_inside_ixp
)
fcrp_indexes = traceroutes_df[condition].index
fcrp_next_indexes = fcrp_indexes + 1 # second link hops
fcrp_links = traceroutes_df.loc[fcrp_indexes.union(fcrp_next_indexes)].sort_values(by=['tx_hostname', 'dst', 'hop_probe_ttl'])
link_statistics(condition)
Found 347,485 links inside 322,943 traceroutes --- Number of unique AS links (unordered): 1,220 Number of unique IP links (unordered): 6,995
plotting¶
In [30]:
def get_cdf_data(series):
"""
Takes a pandas Series, sorts it, and returns x and y values for a CDF plot.
"""
if series.empty:
return np.array([]), np.array([])
# Sort the data
data_sorted = np.sort(series)
# Calculate the CDF y-values
p = np.arange(1, len(data_sorted) + 1) / len(data_sorted)
return data_sorted, p
def downsample_cdf(x, y, num_points=1000):
"""
Downsamples CDF to `num_points` evenly spaced percentiles (between 0 and 1).
Assumes `x` is sorted and `y` is the cumulative probability.
"""
x = np.array(x)
y = np.array(y)
# Create target y-values (percentiles)
y_target = np.linspace(0, 1, num_points)
# Interpolate to find corresponding x-values
x_downsampled = np.interp(y_target, y, x)
return x_downsampled, y_target
def thousands_formatter(x, pos):
return f'{int(x):,}'
In [31]:
# indexes of links that are not candidate RP
non_fcrp_indexes = links_indexes.difference(fcrp_indexes)
In [32]:
sample_size = 2_000
cb_safe_colors = {
'blue': '#4477AA',
'green': '#228833',
'red': '#EE6677',
'orange': '#FFA500',
}
indexes_to_plot = {
"cRP": (fcrp_indexes, cb_safe_colors['red'], 'o'),
"non-cRP": (non_fcrp_indexes, cb_safe_colors['blue'], 'o'),
}
# Create a figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
# plot rtt deltas (left subplot)
y_text_offset_1 = 0.05 # text label offset
for label, (idx, color, marker) in indexes_to_plot.items():
latencies = traceroutes_df.loc[idx]['rtt_delta_next_hop'].dropna()
# latencies = latencies[latencies > 0]
x_full, y_full = get_cdf_data(latencies)
x_down, y_down = downsample_cdf(x_full, y_full, num_points=sample_size)
ax1.plot(x_down, y_down, marker=marker, linestyle='none', label=label, markersize=4, color=color)
# statistics
mean_latency = np.mean(latencies)
p75_latency = latencies.quantile(0.75)
p95_latency = latencies.quantile(0.95)
# ax1.axvline(x=p75_latency, color=color, linestyle=':')
# ax1.axvline(x=p95_latency, color=color, linestyle=':')
ax1.text(0.95, y_text_offset_1, f'95th Pctl ({label}): {p95_latency:.2f} ms',
transform=ax1.transAxes, color=color, fontsize=20,
ha='right', va='bottom')
y_text_offset_1 += 0.07
ax1.text(0.95, y_text_offset_1, f'75th Pctl ({label}): {p75_latency:.2f} ms',
transform=ax1.transAxes, color=color, fontsize=20,
ha='right', va='bottom')
y_text_offset_1 += 0.07
ax1.text(0.95, y_text_offset_1, f'Mean ({label}): {mean_latency:.2f} ms',
transform=ax1.transAxes, color=color, fontsize=20,
ha='right', va='bottom')
y_text_offset_1 += 0.07
# Add a little extra space before the next group of text
y_text_offset_1 += 0.03
ax1.set_xlabel("RTT Delta to Next Hop (ms)")
ax1.set_ylabel("CDF")
# ax1.set_title("CDF of RTT Delta to Next Hop", fontsize=16)
ax1.grid(True, which="both", linestyle='--', linewidth=0.5)
ax1.xaxis.set_major_locator(mticker.MultipleLocator(10))
ax1.set_xlim(left=0, right=100)
ax1.yaxis.set_major_locator(mticker.MultipleLocator(0.1))
# ax1.set_xscale('log')
# plot distances (right subplot)
y_text_offset_2 = 0.05 # text label offset
for label, (idx, color, marker) in indexes_to_plot.items():
distances = traceroutes_df.loc[idx]['distance_to_next_geoloc_hop_km'].dropna()
x_full, y_full = get_cdf_data(distances)
x_down, y_down = downsample_cdf(x_full, y_full, num_points=sample_size)
ax2.plot(x_down, y_down, marker=marker, linestyle='none', label=label, markersize=4, color=color)
# statistics
mean_distance = int(np.mean(distances))
p75_distance = int(distances.quantile(0.75))
p95_distance = int(distances.quantile(0.95))
# ax2.axvline(x=p75_distance, color=color, linestyle=':')
# ax2.axvline(x=p95_distance, color=color, linestyle=':')
ax2.text(0.95, y_text_offset_2, f'95th Pctl ({label}): {p95_distance:,} km',
transform=ax2.transAxes, color=color, fontsize=20,
ha='right', va='bottom')
y_text_offset_2 += 0.07
ax2.text(0.95, y_text_offset_2, f'75th Pctl ({label}): {p75_distance:,} km',
transform=ax2.transAxes, color=color, fontsize=20,
ha='right', va='bottom')
y_text_offset_2 += 0.07
ax2.text(0.95, y_text_offset_2, f'Mean ({label}): {mean_distance:,} km',
transform=ax2.transAxes, color=color, fontsize=20,
ha='right', va='bottom')
y_text_offset_2 += 0.07
# Add a little extra space before the next group of text
y_text_offset_2 += 0.03
ax2.set_xlabel("Distance to next hop (km)")
ax2.set_ylabel("CDF")
ax2.xaxis.set_major_formatter(mticker.FuncFormatter(thousands_formatter))
ax2.yaxis.set_major_locator(mticker.MultipleLocator(0.1))
# ax2.set_title("CDF of Distance to Next Hop", fontsize=16)
ax2.grid(True, which="both", linestyle='--', linewidth=0.5)
ax2.xaxis.set_major_locator(mticker.MultipleLocator(2_000))
ax2.set_xlim(left=0, right=10_000)
# ax2.set_xscale('log')
# tilt labels
plt.setp(ax1.get_xticklabels(), rotation=30, ha="right")
plt.setp(ax2.get_xticklabels(), rotation=30, ha="right")
# shared legend for the line types
# handles, labels = ax1.get_legend_handles_labels()
# fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, -0.05), ncol=2, markerscale=5)
# adjust layout to prevent titles/labels from overlapping and make room for the legend
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.savefig("./plots/cdf_comparison.pdf", bbox_inches='tight')
plt.show()
In [33]:
# classify traceroutes with and without candidate RP links
# get the traceroute identifier keys (tx_hostname, dst) pairs
fcrp_keys_index = pd.MultiIndex.from_frame(
fcrp_links[['tx_hostname', 'dst']].drop_duplicates()
)
# get the indices for each traceroute identifier
traceroutes_keys_index = pd.MultiIndex.from_frame(
traceroutes_df[['tx_hostname', 'dst']]
)
# check whether the traceroute identifier is part of the candidate RP set
traceroutes_df['is_fcrp'] = traceroutes_keys_index.isin(fcrp_keys_index)
In [34]:
def calculate_delta_cdf(df, col1, col2, downsample=False):
"""
Calculates the CDF for the delta between two columns in a DataFrame.
Args:
df (pd.DataFrame): The input DataFrame.
col1 (str): The name of the first column for the delta (e.g., 'dist_to_phop').
col2 (str): The name of the second column for the delta (e.g., 'dist_nearest_site').
downsample (bool): If True, downsamples the CDF data for cleaner plotting.
Returns:
tuple: A tuple containing (x_cdf, y_cdf) for the delta's CDF.
Returns (np.array([]), np.array([])) if the input DataFrame is empty.
"""
# get the delta
delta = df[col1] - df[col2]
# print optimal and sub-optimal traceroutes
print(f"Optimal traceroutes {delta[delta <= 100].count()}")
print(f"Sub-optimal traceroutes {delta[delta > 100].count()}")
# compute CDF
x, y = get_cdf_data(delta)
# downsample
if downsample:
x, y = downsample_cdf(x, y)
return x, y
In [35]:
# get the CDF sub-optimal/optimal routing metrics for all, CRP, and non-CRP traceroutes
# all traceroutes
all_links_firsts = traceroutes_df.groupby(['tx_hostname', 'dst_prefix']).first()
print(f"Found {len(all_links_firsts)} unique 'all' traceroutes.")
# traceroutes with candidate RPs
fcrp_links_firsts = fcrp_links.groupby(['tx_hostname', 'dst_prefix']).first()
print(f"Found {len(fcrp_links_firsts)} unique 'FCR-P' traceroutes.")
# traceroutes without candidate RPs
non_fcrp_links_firsts = traceroutes_df[~traceroutes_df['is_fcrp']].groupby(['tx_hostname', 'dst_prefix']).first()
print(f"Found {len(non_fcrp_links_firsts)} unique 'non-FCR-P' traceroutes.")
print("\nCalculating CDFs...")
print('All traceorutes')
x_all, y_all = calculate_delta_cdf(all_links_firsts, 'dist_to_phop', 'dist_nearest_site')
print('Candidate RP traceroutes')
x_fcrp, y_fcrp = calculate_delta_cdf(fcrp_links_firsts, 'dist_to_phop', 'dist_nearest_site')
print('non RP traceroutes')
x_non_fcrp, y_non_fcrp = calculate_delta_cdf(non_fcrp_links_firsts, 'dist_to_phop', 'dist_nearest_site')
Found 3353799 unique 'all' traceroutes. Found 322943 unique 'FCR-P' traceroutes. Found 3030856 unique 'non-FCR-P' traceroutes. Calculating CDFs... All traceorutes Optimal traceroutes 2302489 Sub-optimal traceroutes 854589 Candidate RP traceroutes Optimal traceroutes 204865 Sub-optimal traceroutes 105632 non RP traceroutes Optimal traceroutes 2097624 Sub-optimal traceroutes 748957
In [36]:
plot_data = {
'All traceroutes': {
'x': x_all,
'y': y_all,
'color': 'blue',
'linestyle': '-'
},
'cRP traceroutes': {
'x': x_fcrp,
'y': y_fcrp,
'color': 'green',
'linestyle': '--'
},
'Non-cRP traceroutes': {
'x': x_non_fcrp,
'y': y_non_fcrp,
'color': 'purple',
'linestyle': ':'
}
}
plt.figure(figsize=(12, 8))
# Loop through the organized data and plot each CDF
for label, data in plot_data.items():
plt.plot(data['x'], data['y'],
label=label,
color=data['color'],
linestyle=data['linestyle'],
drawstyle='steps-post',
linewidth=2,
)
plt.xlabel("Delta between p-hop and nearest anycast site (km)", fontsize=20)
plt.ylabel("CDF (Cumulative Probability)", fontsize=20)
plt.legend(loc='best', fontsize=25)
plt.grid(True, linestyle='--', alpha=0.6)
plt.xlim(left=1) # Start at 1km to avoid log(0) issues
plt.xscale('log')
plt.yticks(np.arange(0, 1.05, 0.10))
plt.ylim(bottom=0)
# # sub-optimal threshold
# plt.axvline(x=100, color='red', linestyle='--', linewidth=1.5, alpha=0.9)
# # Text annotation for the vertical line
# plt.text(100, 0.5, '100 km Delta',
# ha='right', va='center', rotation=90, color='red',
# bbox=dict(facecolor='white', edgecolor='red', boxstyle='round,pad=0.3'))
# plt.text(60, 0.4, 'Optimal site reached', ha='right', va='center', rotation=0, color='red')
# plt.text(120, 0.6, 'Sub-optimal site reached', ha='left', va='center', rotation=0, color='red')
plt.ylim(bottom=0.3)
plt.tight_layout()
plt.savefig('./plots/comparison_cdf_distances.pdf', bbox_inches='tight', pad_inches=0.1)
plt.show()
In [39]:
grouped_traceroutes = traceroutes_df.groupby(['tx_hostname', 'dst'])
# grouped_traceroutes = traceroutes_df[traceroutes_df['is_fcrp']].groupby(['tx_hostname', 'dst'])
# grouped_traceroutes = traceroutes_df[~traceroutes_df['is_fcrp']].groupby(['tx_hostname', 'dst'])
summary_df = grouped_traceroutes.agg(
destination_rtt=('pop_rtt', 'first'),
distance_to_phop=('dist_to_phop', 'first'),
inter_hop_distance_sum=('distance_to_next_geoloc_hop_km', 'sum'), # Renamed for clarity
encounters_rp=('is_fcrp', 'first'),
# get distance from VP to first geolocated hop (to include it in the sum)
vp_lat=('vp_lat', 'first'),
vp_lon=('vp_lon', 'first'),
first_hop_lat=('hop_lat', 'first'), # 'first' will get the first non-NaN hop lat
first_hop_lon=('hop_lon', 'first'),
).reset_index()
# calculate distance from VP to first hop
summary_df['dist_vp_to_first_hop'] = summary_df.apply(
lambda row: haversine(
row['vp_lat'], row['vp_lon'],
row['first_hop_lat'], row['first_hop_lon']
),
axis=1
)
# add
summary_df['path_length_km'] = (
summary_df['inter_hop_distance_sum'] + summary_df['dist_vp_to_first_hop'].fillna(0)
)
summary_df.drop(columns=[
'inter_hop_distance_sum',
'dist_vp_to_first_hop',
'vp_lat', 'vp_lon',
'first_hop_lat', 'first_hop_lon'
], inplace=True)
summary_df = summary_df.reset_index()
summary_df = summary_df.rename(columns={
'destination_rtt': 'pop_rtt_ms',
'distance_to_phop': 'dist_to_phop_km',
'total_distance_traveled_km': 'path_length_km'
})
summary_df.head()
Out[39]:
| index | tx_hostname | dst | pop_rtt_ms | dist_to_phop_km | encounters_rp | path_length_km | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | abz2-uk | 1.1.1.0 | 13.729 | 408.086189 | False | 490.381207 |
| 1 | 1 | abz2-uk | 1.10.10.10 | 132.708 | 8329.185570 | False | 9024.041845 |
| 2 | 2 | abz2-uk | 1.12.0.1 | 25.568 | 1054.217133 | False | 1328.295086 |
| 3 | 3 | abz2-uk | 1.12.12.12 | 25.344 | 1054.217133 | False | 1328.295086 |
| 4 | 4 | abz2-uk | 1.12.13.50 | 25.461 | 1054.217133 | False | 1328.295086 |
In [40]:
# encountering RP
len(summary_df[summary_df['encounters_rp']])
Out[40]:
322943
In [41]:
# not encounter RP
len(summary_df[~summary_df['encounters_rp']])
Out[41]:
3030856
In [42]:
summary_df['pop_rtt_s'] = summary_df['pop_rtt_ms'] / 1000.0
summary_df['max_dist_km'] = (SPEED_IN_FIBRE * summary_df['pop_rtt_s']) / 2.0
In [43]:
summary_df = summary_df[['tx_hostname', 'dst', 'max_dist_km', 'dist_to_phop_km', 'path_length_km', 'encounters_rp']]
In [44]:
summary_df.head()
Out[44]:
| tx_hostname | dst | max_dist_km | dist_to_phop_km | path_length_km | encounters_rp | |
|---|---|---|---|---|---|---|
| 0 | abz2-uk | 1.1.1.0 | 1402.809941 | 408.086189 | 490.381207 | False |
| 1 | abz2-uk | 1.10.10.10 | 13559.917088 | 8329.185570 | 9024.041845 | False |
| 2 | abz2-uk | 1.12.0.1 | 2612.502337 | 1054.217133 | 1328.295086 | False |
| 3 | abz2-uk | 1.12.12.12 | 2589.614331 | 1054.217133 | 1328.295086 | False |
| 4 | abz2-uk | 1.12.13.50 | 2601.569227 | 1054.217133 | 1328.295086 | False |
In [45]:
# show top 10 highest path_length_km
summary_df.sort_values(by='path_length_km', ascending=False).head(10)
Out[45]:
| tx_hostname | dst | max_dist_km | dist_to_phop_km | path_length_km | encounters_rp | |
|---|---|---|---|---|---|---|
| 2077060 | mru-mu | 104.44.14.118 | 35510.944818 | 5913.311360 | 86380.306706 | False |
| 3151758 | tnr-mg | 104.44.14.118 | 35146.882481 | 6918.896709 | 85411.988127 | False |
| 818740 | dar-tz | 13.107.52.1 | 29581.623084 | 9559.878490 | 67850.370308 | False |
| 2732408 | san9-us | 104.44.15.107 | 17136.883190 | 9367.696299 | 67802.376612 | True |
| 2625319 | ral-us | 104.44.15.107 | 19827.552153 | 9267.672184 | 67386.853147 | True |
| 2682840 | san-us | 104.44.15.107 | 17355.749742 | 9377.790495 | 67300.572806 | True |
| 2999514 | sna-us | 104.44.15.107 | 20023.019806 | 9322.615490 | 67127.098487 | True |
| 1085612 | ful-us | 104.44.15.107 | 19183.111749 | 9303.171070 | 67114.993131 | True |
| 33682 | acv-us | 104.44.15.107 | 19114.345554 | 8903.647097 | 67074.280371 | True |
| 2077333 | mru-mu | 13.107.52.1 | 27512.608698 | 8557.610569 | 67058.055931 | False |
In [46]:
traceroutes_df[(traceroutes_df['dst'] == '104.44.14.118') & (traceroutes_df['tx_hostname'] == 'mru-mu')]
Out[46]:
| tx_hostname | dst | hop_addr | hop_probe_ttl | hop_rtt | ip4_address | hop_name | vp_lat | vp_lon | tx_city | ... | next_hop_is_bogon | next_hop_asn | next_hop_addr | hop_org | next_hop_org | dst_org | hop_ixps | next_hop_ixps | ixps_intersection | is_fcrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 466985 | mru-mu | 104.44.14.118 | 196.49.9.51 | 1 | 0.505 | 196.49.9.51 | 53-fa0-1.br02.mixp.org | -20.24 | 57.49 | Ebene | ... | False | 36868 | 196.46.40.57 | {} | {} | {MSFT-ARIN} | (1508) | (18, 344, 592, 129, 597, 1508) | (1508) | False |
| 466986 | mru-mu | 104.44.14.118 | 196.46.40.57 | 2 | 0.607 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 36868 | 196.46.34.109 | {} | {} | {MSFT-ARIN} | (18, 344, 592, 129, 597, 1508) | (18, 344, 592, 129, 597, 1508) | (18, 344, 592, 129, 597, 1508) | False |
| 466987 | mru-mu | 104.44.14.118 | 196.46.34.109 | 3 | 41.234 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | - | 196.60.8.133 | {} | {None} | {MSFT-ARIN} | (18, 344, 592, 129, 597, 1508) | () | () | False |
| 466988 | mru-mu | 104.44.14.118 | 196.60.8.133 | 4 | 41.049 | 196.60.8.133 | microsoft.ixp.joburg | -20.24 | 57.49 | Ebene | ... | False | 8075 | 104.44.50.82 | {None} | {MSFT-ARIN} | {MSFT-ARIN} | () | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | () | False |
| 466989 | mru-mu | 104.44.14.118 | 104.44.50.82 | 5 | 41.258 | 104.44.50.82 | ae24-0.icr01.jnb20.ntwk.msn.net | -20.24 | 57.49 | Ebene | ... | False | 8075 | 104.44.20.65 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 466990 | mru-mu | 104.44.14.118 | 104.44.20.65 | 6 | 268.644 | 104.44.20.65 | be-100-0.ibr01.jnb20.ntwk.msn.net | -20.24 | 57.49 | Ebene | ... | False | 8075 | 51.10.19.171 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 466991 | mru-mu | 104.44.14.118 | 51.10.19.171 | 7 | 274.991 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 8075 | 51.10.19.178 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 466992 | mru-mu | 104.44.14.118 | 51.10.19.178 | 8 | 273.530 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 8075 | 51.10.4.53 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 466993 | mru-mu | 104.44.14.118 | 51.10.4.53 | 9 | 273.800 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 8075 | 104.44.16.20 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 466994 | mru-mu | 104.44.14.118 | 104.44.16.20 | 10 | 274.080 | 104.44.16.20 | be-1-0.ibr02.auh20.ntwk.msn.net | -20.24 | 57.49 | Ebene | ... | False | 8075 | 51.10.4.54 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 466995 | mru-mu | 104.44.14.118 | 51.10.4.54 | 11 | 273.646 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 8075 | 51.10.1.180 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 466996 | mru-mu | 104.44.14.118 | 51.10.1.180 | 12 | 269.285 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 8075 | 51.10.9.255 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 466997 | mru-mu | 104.44.14.118 | 51.10.9.255 | 13 | 274.267 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 8075 | 104.44.55.64 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 466998 | mru-mu | 104.44.14.118 | 104.44.55.64 | 14 | 295.183 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 8075 | 51.10.33.16 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 466999 | mru-mu | 104.44.14.118 | 51.10.33.16 | 15 | 269.041 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 8075 | 51.10.36.17 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 467000 | mru-mu | 104.44.14.118 | 51.10.36.17 | 16 | 273.466 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 8075 | 104.44.19.130 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 467001 | mru-mu | 104.44.14.118 | 104.44.19.130 | 17 | 273.895 | 104.44.19.130 | be-6-0.ibr01.pnq20.ntwk.msn.net | -20.24 | 57.49 | Ebene | ... | False | 8075 | 104.44.19.209 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 467002 | mru-mu | 104.44.14.118 | 104.44.19.209 | 18 | 273.190 | 104.44.19.209 | be-10-0.ibr01.yto20.ntwk.msn.net | -20.24 | 57.49 | Ebene | ... | False | 8075 | 51.10.9.118 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 467003 | mru-mu | 104.44.14.118 | 51.10.9.118 | 19 | 268.620 | NaN | NaN | -20.24 | 57.49 | Ebene | ... | False | 8075 | 104.44.19.141 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 467004 | mru-mu | 104.44.14.118 | 104.44.19.141 | 20 | 272.963 | 104.44.19.141 | be-13-0.ibr02.per01.ntwk.msn.net | -20.24 | 57.49 | Ebene | ... | False | 8075 | 104.44.32.148 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 467005 | mru-mu | 104.44.14.118 | 104.44.32.148 | 21 | 273.387 | 104.44.32.148 | ae25-0.ear01.per01.ntwk.msn.net | -20.24 | 57.49 | Ebene | ... | False | 8075 | 104.44.234.32 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 467006 | mru-mu | 104.44.14.118 | 104.44.234.32 | 22 | 326.804 | 104.44.234.32 | ae20-0.ier01.per30.ntwk.msn.net | -20.24 | 57.49 | Ebene | ... | False | 8075 | 104.44.14.118 | {MSFT-ARIN} | {MSFT-ARIN} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | False |
| 467007 | mru-mu | 104.44.14.118 | 104.44.14.118 | 23 | 347.538 | 104.44.14.118 | ae69-0.ier01.sjc.ntwk.msn.net | -20.24 | 57.49 | Ebene | ... | NaN | NaN | NaN | {MSFT-ARIN} | {} | {MSFT-ARIN} | (63, 64, 167, 250, 713, 1419, 98, 236, 26, 355... | () | () | False |
23 rows × 49 columns
In [47]:
summary_df.describe()
Out[47]:
| max_dist_km | dist_to_phop_km | path_length_km | |
|---|---|---|---|
| count | 3.353799e+06 | 3.157078e+06 | 3.353799e+06 |
| mean | 3.870442e+03 | 1.726087e+03 | 2.094826e+03 |
| std | 7.418802e+03 | 3.267059e+03 | 4.147670e+03 |
| min | 1.532679e+00 | 9.914615e-02 | 0.000000e+00 |
| 25% | 2.516659e+02 | 1.736144e+01 | 3.467871e+01 |
| 50% | 1.193344e+03 | 3.054161e+02 | 4.122981e+02 |
| 75% | 3.873539e+03 | 1.450546e+03 | 1.649152e+03 |
| max | 1.216326e+06 | 1.991397e+04 | 8.638031e+04 |
In [48]:
destination_dist_col = 'dist_to_phop_km'
summary_df['detour_ratio'] = summary_df['path_length_km'] / summary_df[destination_dist_col].replace(0, np.nan)
summary_df.dropna(subset=['detour_ratio'], inplace=True)
# sample size
# n_per_group = 25_000
n_per_group = 5_000
df_with_rp = summary_df[summary_df['encounters_rp'] == True]
df_without_rp = summary_df[summary_df['encounters_rp'] == False]
n_sample_with_rp = min(n_per_group, len(df_with_rp))
n_sample_without_rp = min(n_per_group, len(df_without_rp))
if n_sample_with_rp < n_per_group:
print(f"Warning: Only found {n_sample_with_rp} data points with 'encounters_rp=True', less than the desired {n_per_group}.")
if n_sample_without_rp < n_per_group:
print(f"Warning: Only found {n_sample_without_rp} data points with 'encounters_rp=False', less than the desired {n_per_group}.")
# Take the random samples
sample_with_rp = df_with_rp.sample(n=n_sample_with_rp, random_state=42)
sample_without_rp = df_without_rp.sample(n=n_sample_without_rp, random_state=42)
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(
sample_without_rp[destination_dist_col],
sample_without_rp['path_length_km'],
color='#E69F00',
s=50,
alpha=0.9,
marker='x',
label='non-cRP traceroutes',
zorder=3
)
ax.scatter(
sample_with_rp[destination_dist_col],
sample_with_rp['path_length_km'],
s=120,
facecolors='none',
edgecolors='#56B4E9',
linewidth=1.0,
label='cRP traceroutes',
zorder=2
)
combined_sample = pd.concat([sample_with_rp, sample_without_rp])
lim_min = min(combined_sample[destination_dist_col].min(), combined_sample['path_length_km'].min())
lim_max = max(combined_sample[destination_dist_col].quantile(1.0), combined_sample['path_length_km'].quantile(0.95))
# plt.plot([lim_min, lim_max], [lim_min, lim_max], 'r--', label='Path length == Distance to p-hop')
x_range = [lim_min, lim_max]
plt.plot(x_range, [x for x in x_range],
color='green',
linestyle='-',
label='Detour Ratio = 1')
plt.plot(x_range, [2 * x for x in x_range],
color='orange',
linestyle='-',
label='Detour Ratio = 2')
plt.plot(x_range, [4 * x for x in x_range],
color='red',
linestyle='-',
label='Detour Ratio = 4')
plt.xlabel(f'Distance to p-hop (km)')
plt.ylabel('Inferred path length (km)')
plt.ylim(top=50_000)
### legend
# plt.legend(loc='lower right')
all_handles, all_labels = ax.get_legend_handles_labels()
marker_handles = []
marker_labels = []
line_handles = []
line_labels = []
for handle, label in zip(all_handles, all_labels):
if 'traceroute' in label:
marker_handles.append(handle)
marker_labels.append(label)
elif 'Detour Ratio' in label:
line_handles.append(handle)
line_labels.append(label)
legend1 = ax.legend(line_handles, line_labels, loc='lower right')
ax.add_artist(legend1)
legend2 = ax.legend(marker_handles, marker_labels, loc='upper left', markerscale=1.5)
plt.grid(True, linestyle='--', alpha=0.7)
ax.xaxis.set_major_formatter(mticker.FuncFormatter(thousands_formatter))
ax.yaxis.set_major_formatter(mticker.FuncFormatter(thousands_formatter))
plt.xlim(left=1)
plt.ylim(bottom=1)
# save fig
plt.savefig('./plots/path_length_vs_distance_to_phop_stratified.pdf', bbox_inches='tight', pad_inches=0)
plt.show()
In [49]:
summary_df[summary_df['detour_ratio'] > 4]
Out[49]:
| tx_hostname | dst | max_dist_km | dist_to_phop_km | path_length_km | encounters_rp | detour_ratio | |
|---|---|---|---|---|---|---|---|
| 3834 | abz2-uk | 154.6.238.1 | 6165.558648 | 693.950592 | 4559.365373 | False | 6.570158 |
| 5584 | abz2-uk | 186.192.80.2 | 22426.056022 | 1199.571949 | 16152.664958 | False | 13.465357 |
| 5585 | abz2-uk | 186.192.81.2 | 22356.370219 | 1199.571949 | 16152.664958 | False | 13.465357 |
| 5586 | abz2-uk | 186.192.83.2 | 22392.132728 | 1199.571949 | 16152.664958 | False | 13.465357 |
| 5587 | abz2-uk | 186.192.89.1 | 22439.441418 | 1199.571949 | 16152.664958 | False | 13.465357 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 3344510 | zrh3-ch | 96.43.100.0 | 86.749628 | 1.662681 | 45.265285 | False | 27.224272 |
| 3344511 | zrh3-ch | 96.43.101.1 | 85.114770 | 1.662681 | 45.265285 | False | 27.224272 |
| 3344517 | zrh3-ch | 96.7.136.0 | 81.231984 | 5.955048 | 30.714144 | False | 5.157666 |
| 3344518 | zrh3-ch | 96.7.137.0 | 79.494948 | 5.955048 | 30.714144 | False | 5.157666 |
| 3344545 | zrh3-ch | 98.98.157.0 | 140.904284 | 1.662681 | 37.964305 | False | 22.833183 |
232047 rows × 7 columns
In [50]:
print(f"Found {len(summary_df[summary_df['detour_ratio'] > 4])} traceroutes with a detour ratio higher than 4 of which {len(summary_df[(summary_df['detour_ratio'] > 4) & (summary_df['encounters_rp'])])} encounter RP")
Found 232047 traceroutes with a detour ratio higher than 4 of which 39775 encounter RP
In [51]:
df_rp = summary_df[summary_df['encounters_rp'] == True]
df_no_rp = summary_df[summary_df['encounters_rp'] == False]
total_rp = len(df_rp)
total_no_rp = len(df_no_rp)
thresholds = range(2, 11)
results_data = []
for t in thresholds:
# Count how many in each group exceed the current threshold
count_rp_gt_t = len(df_rp[df_rp['detour_ratio'] > t])
count_no_rp_gt_t = len(df_no_rp[df_no_rp['detour_ratio'] > t])
# Calculate the percentage for each group
percent_rp = (count_rp_gt_t / total_rp) * 100
percent_no_rp = (count_no_rp_gt_t / total_no_rp) * 100
# Store the results for this threshold
results_data.append({
'Detour ratio': f'> {t}',
'% of Non-cRP': percent_no_rp,
'% of cRP': percent_rp
})
results_df = pd.DataFrame(results_data)
results_df.set_index('Detour ratio', inplace=True)
results_df['% of Non-cRP'] = results_df['% of Non-cRP'].map('{:.2f}%'.format)
results_df['% of cRP'] = results_df['% of cRP'].map('{:.2f}%'.format)
results_df
Out[51]:
| % of Non-cRP | % of cRP | |
|---|---|---|
| Detour ratio | ||
| > 2 | 13.70% | 30.74% |
| > 3 | 8.94% | 18.65% |
| > 4 | 6.75% | 12.81% |
| > 5 | 5.93% | 10.96% |
| > 6 | 5.51% | 10.17% |
| > 7 | 4.65% | 8.07% |
| > 8 | 4.51% | 7.25% |
| > 9 | 4.27% | 7.20% |
| > 10 | 4.24% | 7.18% |
In [52]:
# show top 10 highest detours
summary_df.sort_values(by='detour_ratio', ascending=False).head(10)
Out[52]:
| tx_hostname | dst | max_dist_km | dist_to_phop_km | path_length_km | encounters_rp | detour_ratio | |
|---|---|---|---|---|---|---|---|
| 2202879 | nrt-jp | 103.35.126.201 | 28281.809167 | 0.164298 | 39714.672062 | True | 241723.819010 |
| 1264628 | hnd-jp | 103.35.126.201 | 28527.855226 | 0.164298 | 39361.733714 | True | 239575.655601 |
| 2222345 | nrt3-jp | 23.11.40.7 | 23544.196395 | 0.164298 | 35818.790852 | True | 218011.492168 |
| 1448050 | ind-us | 23.11.38.17 | 23495.150669 | 0.245884 | 38924.410667 | False | 158303.875966 |
| 51053 | adl-au | 23.11.38.17 | 8267.270184 | 0.194422 | 12145.571013 | False | 62470.002110 |
| 1267213 | hnd-jp | 13.107.54.1 | 5621.355446 | 0.164298 | 5953.264205 | True | 36234.612663 |
| 1267207 | hnd-jp | 13.107.40.1 | 5573.127149 | 0.164298 | 5953.264205 | True | 36234.612663 |
| 1267202 | hnd-jp | 13.107.16.1 | 5147.757655 | 0.164298 | 5910.908737 | True | 35976.815611 |
| 2206646 | nrt-jp | 158.220.129.11 | 6233.507414 | 0.164298 | 5764.573775 | False | 35086.146139 |
| 2206648 | nrt-jp | 158.220.131.11 | 6254.045312 | 0.164298 | 5764.573775 | False | 35086.146139 |
In [53]:
fcrp_links['hop_sub_region']
Out[53]:
1297759 Northern Europe
1297760 Eastern Asia
1297700 Northern Europe
1297701 South-eastern Asia
1298212 Northern Europe
...
2072739 Eastern Europe
2072932 Western Europe
2072933 Eastern Europe
2072718 Western Europe
2072719 Eastern Europe
Name: hop_sub_region, Length: 680327, dtype: object
In [54]:
# get next region for each hop
traceroutes_df['next_hop_sub_region'] = traceroutes_df.groupby(['tx_hostname', 'dst'])['hop_sub_region'].shift(-1)
In [55]:
# get the first hops of all candidate RP links
first_hops_of_crp = traceroutes_df.loc[fcrp_indexes]
first_hops_of_crp[['hop_sub_region', 'next_hop_sub_region']]
Out[55]:
| hop_sub_region | next_hop_sub_region | |
|---|---|---|
| 1297759 | Northern Europe | Eastern Asia |
| 1297700 | Northern Europe | South-eastern Asia |
| 1298212 | Northern Europe | South-eastern Asia |
| 1298727 | Northern America | Northern America |
| 1299219 | Northern Europe | Eastern Asia |
| ... | ... | ... |
| 2072650 | Western Europe | Sub-Saharan Africa |
| 2072970 | Western Europe | Eastern Europe |
| 2072738 | Western Europe | Eastern Europe |
| 2072932 | Western Europe | Eastern Europe |
| 2072718 | Western Europe | Eastern Europe |
347485 rows × 2 columns
In [56]:
first_hops_of_crp['next_hop_sub_region'].value_counts().head(10)
Out[56]:
next_hop_sub_region Northern America 162040 Western Europe 46905 Northern Europe 33184 Eastern Asia 22711 South-eastern Asia 16191 Southern Europe 15901 Sub-Saharan Africa 10101 Western Asia 8110 Southern Asia 5281 Australia and New Zealand 5105 Name: count, dtype: int64
In [57]:
first_hops_of_crp['hop_sub_region'].value_counts().head(10)
Out[57]:
hop_sub_region Northern America 161578 Western Europe 46045 Northern Europe 39479 Eastern Asia 23884 Southern Europe 18272 Southern Asia 8756 Sub-Saharan Africa 7955 South-eastern Asia 7358 Western Asia 6761 Australia and New Zealand 4644 Name: count, dtype: int64
In [58]:
first_hops_of_crp.groupby(['hop_sub_region', 'next_hop_sub_region']).size().nlargest(10)
Out[58]:
hop_sub_region next_hop_sub_region Northern America Northern America 156446 Western Europe Western Europe 40027 Northern Europe Northern Europe 30718 Eastern Asia Eastern Asia 20076 Southern Europe Southern Europe 15601 Sub-Saharan Africa Sub-Saharan Africa 7593 Western Asia Western Asia 6673 South-eastern Asia South-eastern Asia 5813 Southern Asia South-eastern Asia 4787 Australia and New Zealand Australia and New Zealand 3992 dtype: int64
In [59]:
inter_region_mask = first_hops_of_crp['hop_sub_region'] != first_hops_of_crp['next_hop_sub_region']
# Step 2: Apply the mask to create a new DataFrame of only inter-region links.
inter_region_links = first_hops_of_crp[inter_region_mask]
# Step 3: Perform the groupby and count on this filtered DataFrame.
top_10_inter_region_links = inter_region_links.groupby(
['hop_sub_region', 'next_hop_sub_region']
).size().nlargest(10)
print("--- Top 10 Inter-Region Links (Hop != Next-Hop) ---")
top_10_inter_region_links
--- Top 10 Inter-Region Links (Hop != Next-Hop) ---
Out[59]:
hop_sub_region next_hop_sub_region
Southern Asia South-eastern Asia 4787
Northern Europe Western Europe 3562
Eastern Asia South-eastern Asia 2768
Western Europe Eastern Europe 1795
Northern America South-eastern Asia 1634
Northern Europe Sub-Saharan Africa 1302
Southern Europe Western Europe 1228
Western Europe Northern Europe 1161
Northern America 1054
Northern Europe Eastern Europe 956
dtype: int64
In [60]:
# calculate region statistics for unique IP links
# Isolate the relevant columns for clarity and efficiency
link_data = first_hops_of_crp[['hop_addr', 'next_hop_addr', 'hop_sub_region', 'next_hop_sub_region']]
# Find all unique IP-level links. Each row in this new DataFrame is one distinct link.
unique_ip_links = link_data.drop_duplicates(subset=['hop_addr', 'next_hop_addr'])
print(f"Original number of rows: {len(first_hops_of_crp)}")
print(f"Number of unique IP-level links: {len(unique_ip_links)}")
Original number of rows: 347485 Number of unique IP-level links: 7000
In [61]:
# top 10 regions where RP originates
unique_ip_links['hop_sub_region'].value_counts().head(12)
Out[61]:
hop_sub_region Northern America 2065 Western Europe 1370 Northern Europe 1011 Eastern Asia 750 South-eastern Asia 372 Southern Asia 301 Southern Europe 257 Eastern Europe 98 Latin America and the Caribbean 75 Australia and New Zealand 65 Western Asia 57 Sub-Saharan Africa 33 Name: count, dtype: int64
In [62]:
# top 10 regions where RP ends
unique_ip_links['next_hop_sub_region'].value_counts().head(12)
Out[62]:
next_hop_sub_region Northern America 1840 Western Europe 1473 South-eastern Asia 810 Eastern Asia 701 Northern Europe 615 Eastern Europe 320 Western Asia 242 Southern Asia 211 Latin America and the Caribbean 211 Sub-Saharan Africa 114 Southern Europe 107 Australia and New Zealand 86 Name: count, dtype: int64
In [63]:
unique_ip_links['hop_sub_region'].nunique() # 12 regions total
Out[63]:
12
In [64]:
hop_counts = unique_ip_links['hop_sub_region'].value_counts().head(12)
next_hop_counts = unique_ip_links['next_hop_sub_region'].value_counts().head(12)
# Combine into a single DataFrame
combined_df = pd.DataFrame({
'RP start': hop_counts,
'RP end': next_hop_counts
})
combined_df['pct_difference'] = ((combined_df['RP end'] - combined_df['RP start']) / combined_df['RP start']) * 100
combined_df
Out[64]:
| RP start | RP end | pct_difference | |
|---|---|---|---|
| Australia and New Zealand | 65 | 86 | 32.307692 |
| Eastern Asia | 750 | 701 | -6.533333 |
| Eastern Europe | 98 | 320 | 226.530612 |
| Latin America and the Caribbean | 75 | 211 | 181.333333 |
| Northern America | 2065 | 1840 | -10.895884 |
| Northern Europe | 1011 | 615 | -39.169139 |
| South-eastern Asia | 372 | 810 | 117.741935 |
| Southern Asia | 301 | 211 | -29.900332 |
| Southern Europe | 257 | 107 | -58.365759 |
| Sub-Saharan Africa | 33 | 114 | 245.454545 |
| Western Asia | 57 | 242 | 324.561404 |
| Western Europe | 1370 | 1473 | 7.518248 |
In [65]:
# plot most often region links
unique_ip_links.groupby(['hop_sub_region', 'next_hop_sub_region']).size().nlargest(10)
Out[65]:
hop_sub_region next_hop_sub_region
Northern America Northern America 1525
Western Europe Western Europe 646
Northern Europe Northern Europe 339
Eastern Asia Eastern Asia 331
South-eastern Asia 301
Northern Europe Western Europe 253
Southern Asia South-eastern Asia 170
Northern America Eastern Asia 143
Western Europe Eastern Europe 139
Northern America 117
dtype: int64
In [66]:
inter_region_mask = unique_ip_links['hop_sub_region'] != unique_ip_links['next_hop_sub_region']
inter_region_links = unique_ip_links[inter_region_mask]
top_10_inter_region_links = inter_region_links.groupby(
['hop_sub_region', 'next_hop_sub_region']
).size().nlargest(10)
print("--- Top 10 Inter-Region Links (Hop != Next-Hop) ---")
top_10_inter_region_links
--- Top 10 Inter-Region Links (Hop != Next-Hop) ---
Out[66]:
hop_sub_region next_hop_sub_region
Eastern Asia South-eastern Asia 301
Northern Europe Western Europe 253
Southern Asia South-eastern Asia 170
Northern America Eastern Asia 143
Western Europe Eastern Europe 139
Northern America 117
Northern Europe 105
Northern Europe Eastern Europe 96
Southern Europe Western Europe 94
Northern America South-eastern Asia 91
dtype: int64
In [ ]: