import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

from pathlib import Path

# Configure Plotly to work better with HTML exports
pio.renderers.default = "notebook"

DATASET = Path(r"C:\Users\eluje\OneDrive\Documents\Lujean Important Docs\Portfolio-Mock Projects\lujean-portfolio1\data\sms_outreach_ranked.csv")
assert DATASET.exists(), f"Dataset not found: {DATASET}"

pd.options.display.float_format = lambda x: f"{x:,.2f}"

df = pd.read_csv(DATASET)
df.columns = [c.strip().lower() for c in df.columns]

print("Rows:", len(df))
df.head(10)

Rows: 11868

summary = pd.DataFrame({
    "column": df.columns,
    "dtype": [df[c].dtype for c in df.columns],
    "non_null": [df[c].notna().sum() for c in df.columns],
    "nulls": [df[c].isna().sum() for c in df.columns]
})
summary

if "outreach_priority" not in df.columns:
    if "outreach_score" in df.columns:
        def bucket(score):
            try:
                if score >= 60: return "High"
                if score >= 50: return "Medium"
                return "Low"
            except Exception:
                return np.nan
        df["outreach_priority"] = df["outreach_score"].apply(bucket)
    else:
        df["outreach_priority"] = np.nan

if "text_opt_in" in df.columns:
    df["text_opt_in"] = df["text_opt_in"].fillna(0).astype(int)

if "outreach_priority" in df.columns:
    pr = df["outreach_priority"].value_counts(dropna=False).rename_axis("priority").reset_index(name="count")
    fig = px.pie(pr, values="count", names="priority", hole=0.55,
                 title="Outreach Priority Distribution")
    fig.update_traces(textposition="inside", textinfo="percent+label")
    fig.show()
else:
    print("Column 'outreach_priority' not found; skipping donut chart.")

if set(["outreach_priority","text_opt_in"]).issubset(df.columns):
    grp = (df.groupby(["outreach_priority","text_opt_in"])["member_id"].count()
             .reset_index(name="count"))
    totals = grp.groupby("outreach_priority")["count"].transform("sum")
    grp["pct"] = grp["count"] / totals * 100

    fig = px.bar(grp.sort_values("outreach_priority"),
                 x="pct", y="outreach_priority",
                 color="text_opt_in", orientation="h",
                 title="Text Opt-In by Priority (share)",
                 labels={"pct":"% of members","outreach_priority":"Priority","text_opt_in":"Opt-in"})
    fig.update_layout(barmode="stack", xaxis_tickformat=".1f")
    fig.show()
else:
    print("Required columns not found; skipping stacked bars.")

if set(["outreach_priority","er_visit_count"]).issubset(df.columns):
    er = (df.groupby("outreach_priority")["er_visit_count"].mean()
            .reset_index(name="avg_er_visits"))
    fig = px.bar(er.sort_values("outreach_priority"),
                 x="outreach_priority", y="avg_er_visits",
                 title="Average ER Visits by Priority",
                 labels={"outreach_priority":"Priority","avg_er_visits":"Average ER visits"})
    fig.show()
else:
    print("Required columns not found; skipping ER chart.")

cols = [c for c in ["member_id","outreach_score","outreach_priority","hra_overdue_days","text_opt_in","er_visit_count","chronic_conditions"] if c in df.columns]
sdf = df.copy()
if "outreach_score" in sdf.columns:
    sdf = sdf.sort_values(["outreach_score","hra_overdue_days"] if "hra_overdue_days" in sdf.columns else ["outreach_score"], ascending=False)
top = sdf[cols].head(25) if cols else sdf.head(25)
top.reset_index(drop=True, inplace=True)
top

	member_id	hra_overdue_days	response_history	chronic_conditions	age	prior_hra_completed	high_risk_flag	recent_kickout_flag	er_visit_count	dsnp_flag	region_code	social_vulnerability_score	text_opt_in	outreach_score	outreach_priority
0	M00001	56	0	3	75	1	1	1	1	1	2	0.29	1	70.55	Medium
1	M00003	158	2	2	77	0	0	0	1	1	2	0.12	0	50.76	Low
2	M00004	109	1	5	70	1	0	0	1	1	1	0.05	1	60.89	Medium
3	M00006	20	4	3	67	0	0	0	0	1	4	0.25	1	30.47	Low
4	M00007	7	1	3	72	0	0	0	0	1	1	0.06	1	34.57	Low
5	M00008	241	1	5	70	1	0	0	0	1	1	0.13	1	64.90	Medium
6	M00009	110	1	3	79	1	0	0	0	1	3	0.25	1	60.94	Medium
7	M00010	147	5	2	66	1	0	0	1	1	2	0.20	1	38.40	Low
8	M00011	2	3	4	71	1	0	0	1	1	3	0.15	1	32.58	Low
9	M00012	365	4	1	71	1	0	0	0	1	2	0.05	1	42.00	Low

	member_id	outreach_score	outreach_priority	hra_overdue_days	text_opt_in	er_visit_count	chronic_conditions
0	M11558	109.10	High	206	1	3	6
1	M02985	105.14	High	94	0	2	6
2	M02461	104.00	High	365	1	2	4
3	M00882	102.89	High	133	1	2	5
4	M10369	100.94	High	110	0	3	6
5	M03301	100.52	High	68	0	5	6
6	M09233	99.25	High	315	1	2	5
7	M08209	98.23	High	43	1	3	6
8	M05146	98.16	High	254	1	3	5
9	M13461	98.00	High	365	1	1	5
10	M09914	97.98	High	111	1	2	6
11	M12501	97.88	High	197	1	2	4
12	M05104	97.78	High	48	1	3	5
13	M00721	97.55	High	334	1	2	6
14	M11739	97.50	High	150	0	2	5
15	M04144	97.27	High	213	0	2	3
16	M13593	97.06	High	303	1	1	5
17	M06440	96.93	High	90	1	2	6
18	M04450	96.73	High	157	0	2	4
19	M01368	95.78	High	287	0	2	5
20	M09755	95.45	High	30	1	2	5
21	M01849	95.30	High	261	0	1	6
22	M12948	95.29	High	29	0	3	5
23	M05712	95.00	High	365	1	2	3
24	M08129	94.92	High	242	1	2	5

SMS Outreach Pilot — Technical Notebook¶

Business Goal¶

Data Dictionary (columns expected)¶

Scoring & Priority Buckets (reproducible rules)¶

Overview Visuals (used in the BI dashboard)¶

1) Priority Distribution (donut)¶

2) Text Opt-In by Priority (stacked bars)¶

3) ER Utilization by Priority (bar)¶

Top Outreach Targets¶

Summary & Next Steps¶

	column	dtype	non_null
0	member_id	object	11868
1	hra_overdue_days	int64	11868
2	response_history	int64	11868
3	chronic_conditions	int64	11868
4	age	int64	11868
5	prior_hra_completed	int64	11868
6	managed_flag	int64	11868
7	high_risk_flag	int64	11868
8	recent_kickout_flag	int64	11868
9	er_visit_count	int64	11868
10	dsnp_flag	int64	11868
11	region_code	int64	11868
12	social_vulnerability_score	float64	11868
13	text_opt_in	int64	11868
14	outreach_score	float64	11868
15	outreach_priority	object	11868