Project

General

Profile

Feature #1190 » nyc_tickets.py

Andre Mene, 09/25/2025 03:54 PM

 

# nyc_tickets.py
# Scrape NYC CityPay "Search by License Plate" for tickets, expand groups,
# collect EVERY individual row, and export to Excel.
#
# Column policy (per user):
# - Always drop Column C (3rd column) before saving each plate sheet.
# - Drop columns with headers: Pending Payment, Total Amount Due, Payment Amount,
# and any "Payment Amount: $..." variant (even if split across lines/spaces).
# - In "Violation #", remove "View Ticket" and keep only the numeric ticket number.
# - Reduce results wait to 10 seconds.

import argparse
import csv
import time
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd
from playwright.sync_api import sync_playwright, Page

CITYPAY_URL = "https://a836-citypay.nyc.gov/citypay/Parking"


# -------------------- small helpers --------------------

def log(msg: str) -> None:
print(f"[tickets] {msg}")

def to_money(s: str) -> float:
try:
s = (s or "").replace("$", "").replace(",", "").strip()
return float(s) if s else 0.0
except Exception:
return 0.0

def read_plates_csv(path: Path) -> List[Tuple[str, str, str]]:
rows: List[Tuple[str, str, str]] = []
with open(path, newline="", encoding="utf-8-sig") as f:
for row in csv.DictReader(f):
plate = (row.get("plate") or "").strip()
state = (row.get("state") or "NY").strip()
ptype = (row.get("plate_type") or "--ALL--").strip()
if plate and state:
rows.append((plate, state, ptype))
return rows


# -------------------- debugging --------------------

def save_debug(page: Page, plate: str, tag: str) -> None:
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
outdir = Path("debug")
outdir.mkdir(exist_ok=True)
html = outdir / f"{plate}_{tag}_{ts}.html"
png = outdir / f"{plate}_{tag}_{ts}.png"
try:
html.write_text(page.content(), encoding="utf-8", errors="ignore")
except Exception:
pass
try:
page.screenshot(path=str(png), full_page=True)
except Exception:
pass
log(f"Saved debug -> {html} {png}")


# -------------------- page/form ops --------------------

def goto_plate_form(page: Page) -> None:
page.goto(CITYPAY_URL, timeout=120_000, wait_until="domcontentloaded")
try:
page.get_by_role("tab", name="Search By License Plate").click(timeout=3000)
except Exception:
pass
page.wait_for_selector("#by-plate-form", timeout=10_000)
page.locator("input#plate-number[name='PLATE_NUMBER']").first.wait_for(
state="attached", timeout=8000
)

def set_plate_value(page: Page, value: str) -> None:
inp = page.locator("input#plate-number[name='PLATE_NUMBER']").first
try:
inp.fill("", timeout=1500)
inp.type(value, delay=15)
return
except Exception:
page.evaluate(
"""(val) => {
const el = document.querySelector("input#plate-number[name='PLATE_NUMBER']");
if (!el) return;
el.value = "";
el.dispatchEvent(new Event('input', {bubbles:true}));
el.value = val;
el.dispatchEvent(new Event('input', {bubbles:true}));
el.dispatchEvent(new Event('change', {bubbles:true}));
}""",
value,
)

def pick_state(page: Page, code: str) -> None:
try:
opener = page.locator("#PLATE_STATE_chosen .chosen-single").first
opener.click(timeout=1500)
search = page.locator("#PLATE_STATE_chosen .chosen-search input").first
search.wait_for(state="visible", timeout=1500)
search.fill("")
search.type(code, delay=25)
page.keyboard.press("Enter")
page.wait_for_timeout(120)
except Exception:
pass

page.evaluate(
"""(code) => {
const sel = document.querySelector('#PLATE_STATE');
if (sel) {
sel.value = code;
sel.dispatchEvent(new Event('change', {bubbles:true}));
}
const span = document.querySelector('#PLATE_STATE_chosen .chosen-single span');
if (span && code) {
const t = (span.textContent || '');
const tail = t.includes('-') ? t.split('-').pop().trim() : t.trim();
span.textContent = code + ' - ' + tail;
}
}""",
code,
)

try:
label = page.locator("#PLATE_STATE_chosen .chosen-single span").inner_text(timeout=800).strip()
if code not in label:
page.evaluate(
"""() => {
const s = document.querySelector('#PLATE_STATE');
if (s) s.style.display='block';
}"""
)
try:
page.locator("select#PLATE_STATE").select_option(value=code)
finally:
page.evaluate(
"""() => {
const s = document.querySelector('#PLATE_STATE');
if (s) s.style.display='none';
}"""
)
page.wait_for_timeout(100)
except Exception:
pass

def pick_plate_type(page: Page, label: str) -> None:
if not label or label == "--ALL--":
return
try:
page.locator("select#PLATE_TYPE[name='PLATE_TYPE']").select_option(label=label)
return
except Exception:
pass
page.evaluate(
"""(label) => {
const sel = document.querySelector('#PLATE_TYPE');
if (!sel) return;
const want = (label || '').trim();
for (const o of sel.options) {
const t = (o.textContent || '').trim();
if (t.startsWith(want)) { sel.value = o.value; break; }
}
sel.dispatchEvent(new Event('change', {bubbles:true}));
}""",
label,
)

def submit_search(page: Page) -> None:
page.get_by_role("button", name="Search").click()


# -------------------- navigation-safe waiting --------------------

def wait_for_results_or_error(page: Page, max_wait_s: int = 10) -> str:
"""
Polls for either the results URL or the inline form error without crashing
when the page is navigating (execution context destroyed).
Returns: 'results' | 'error' | 'timeout'
"""
deadline = time.time() + max_wait_s
while time.time() < deadline:
# Did we reach results?
try:
if "/searchResults" in page.url:
return "results"
except Exception:
time.sleep(0.2)
continue

# Is the red inline error visible on the form?
try:
has_error = page.evaluate(
"""() => {
const el = document.querySelector('form#by-plate-form .error-msg label.error');
return !!(el && el.offsetParent !== null && el.textContent.trim().length);
}"""
)
if has_error:
return "error"
except Exception:
pass

time.sleep(0.2)
return "timeout"


# -------------------- results scraping --------------------

def expand_all_groups(page: Page) -> None:
# Click group toggles a few times to make sure everything is expanded
for _ in range(3):
for el in page.locator("tr.dtrg-group, tr.dtrg-group .ico-wrapper, tr.dtrg-group u").all():
try:
el.click(timeout=150)
except Exception:
pass
page.wait_for_timeout(120)

def scrape_all_ticket_rows(page: Page) -> List[Dict[str, str]]:
# Evaluate in the DOM to grab ALL rows, skipping group headers
return page.evaluate("""
() => {
const out = [];
const tables = Array.from(document.querySelectorAll('table'));
for (const t of tables) {
let headers = Array.from(t.querySelectorAll('thead th')).map(th => (th.textContent||'').trim());
if (!headers.length) {
const first = t.querySelector('tr');
if (!first) continue;
headers = Array.from(first.querySelectorAll('th,td')).map(c => (c.textContent||'').trim());
}
if (!headers.length) continue;

const bodyRows = t.querySelectorAll('tbody tr');
const rows = bodyRows.length ? Array.from(bodyRows) : Array.from(t.querySelectorAll('tr')).slice(1);

for (const tr of rows) {
const cls = (tr.getAttribute('class')||'');
if (cls.includes('dtrg-group')) continue;
const cells = Array.from(tr.querySelectorAll('th,td')).map(td => (td.textContent||'').trim());
if (!cells.some(Boolean)) continue;

const row = {};
for (let i=0;i<headers.length;i++) row[headers[i]] = cells[i] ?? '';
out.push(row);
}
}
return out;
}
""")


# -------------------- run one plate --------------------

def run_one_plate(page: Page, plate: str, state: str, ptype: str, first_plate: bool, debug: bool) -> pd.DataFrame:
goto_plate_form(page)
set_plate_value(page, plate)
pick_state(page, state)
pick_plate_type(page, ptype)
submit_search(page)

if first_plate:
# If a captcha shows up the first time, let the user solve it
time.sleep(1)
try:
if "recaptcha" in page.content().lower():
input("\nSolve any CAPTCHA in the browser, wait for results, then press Enter here...")
except Exception:
pass

status = wait_for_results_or_error(page, max_wait_s=10)

if status == "error":
if debug:
save_debug(page, plate, "form_error")
return pd.DataFrame()

if status != "results":
if debug:
save_debug(page, plate, "timeout")
return pd.DataFrame()

# On results page
expand_all_groups(page)
rows = scrape_all_ticket_rows(page)
df = pd.DataFrame(rows)

if df.empty and debug:
save_debug(page, plate, "no_rows")

return df


# -------------------- main --------------------

def main():
ap = argparse.ArgumentParser(description="Scrape NYC CityPay tickets by plate into Excel")
ap.add_argument("--plates", default="plates.csv", help="CSV with columns: plate,state,plate_type")
ap.add_argument("--out", default="tickets_report.xlsx", help="Output Excel path")
ap.add_argument("--debug", action="store_true", help="Save HTML/PNG for errors/no rows")
args = ap.parse_args()

plate_rows = read_plates_csv(Path(args.plates))
if not plate_rows:
print("No plates found. Create plates.csv with columns: plate,state,plate_type")
return

with sync_playwright() as p:
browser = p.chromium.launch(headless=False, slow_mo=110)
ctx = browser.new_context(viewport={"width": 1360, "height": 900})
page = ctx.new_page()

log(f"Opening {CITYPAY_URL}")
page.goto(CITYPAY_URL, timeout=120_000, wait_until="domcontentloaded")

per_plate: List[Tuple[str, pd.DataFrame]] = []
summary: List[Dict[str, object]] = []
first = True

for i, (plate, state, ptype) in enumerate(plate_rows, start=1):
log(f"[{i}/{len(plate_rows)}] {plate} {state}")
df = run_one_plate(page, plate, state, ptype, first, args.debug)
first = False

# Return to form for the next plate
goto_plate_form(page)

if df.empty:
summary.append({"plate": plate, "state": state, "rows": 0, "total_due": 0.0})
continue

# normalize and compute totals
df.insert(0, "PLATE", plate)
df.insert(1, "STATE", state)

for col in list(df.columns):
if "AMOUNT" in col.upper():
df[col] = df[col].map(to_money)

total_due = 0.0
for col in df.columns:
if "TOTAL AMOUNT DUE" in col.upper():
total_due = float(df[col].sum())
break

per_plate.append((plate, df))
summary.append({
"plate": plate,
"state": state,
"rows": len(df),
"total_due": round(total_due, 2),
})

# Write Excel
out_path = Path(args.out)
with pd.ExcelWriter(out_path, engine="openpyxl") as xl:
pd.DataFrame(summary).sort_values(
["total_due", "plate"], ascending=[False, True]
).to_excel(xl, index=False, sheet_name="Summary")

used = set()
for plate, df in per_plate:
sheet = plate[:28]
n = 2
while sheet in used:
sheet = f"{plate[:26]}-{n}"
n += 1
used.add(sheet)

# ------------ FINAL column cleanup (header + content) ------------
def _norm_hdr(s: str) -> str:
return re.sub(r"\s+", " ", str(s)).strip().lower()

drop_names = {"pending payment", "total amount due", "payment amount"}
drop_regex = re.compile(r"(?i)^\s*payment\s*amount\s*:\s*\$[0-9,.\s]+\s*$")

to_drop = []
for c in list(df.columns):
h = _norm_hdr(c)
if h in drop_names or drop_regex.match(h):
to_drop.append(c)

# Always remove Column C (3rd column)
if len(df.columns) >= 3:
col_c = df.columns[2]
if col_c not in to_drop:
to_drop.append(col_c)

if to_drop:
df = df.drop(columns=to_drop)

# Clean 'Violation #' column: remove 'View Ticket', keep only numeric ticket
for _col in list(df.columns):
if _norm_hdr(_col) == "violation #":
s = df[_col].astype(str)
s = s.str.replace(r"\s*view\s*ticket\s*", "", regex=True, flags=re.IGNORECASE)
digits = s.str.extract(r"(\d{5,})", expand=False)
df[_col] = digits.fillna(s.str.strip())
break

# -----------------------------------------------------------------
df.to_excel(xl, index=False, sheet_name=sheet)

log(f"Saved report -> {out_path.resolve()}")
ctx.close()
browser.close()


if __name__ == "__main__":
main()
(2-2/4)