Sunday, February 1, 2009

6-figure salary data analysis



So, I got bored the other day. I threw together this hack to pull $100K+ salary data and parse it ;-)

$ cat parse.salary.com.sh
#!/bin/sh
#
# Scrape salary figures from swz.salary.com.
#
# Starting from the swzl_salaryrangenarrow_50.html page, follow every
# swzl_salaryrangenarrowjob_50*.html link found on it, then every
# swzl_compresult.asp "U.S. National Averages" link found on those pages.
#
# Output (stdout), per result link:
#   - the still URL-encoded JobTitle query value (e.g. Accounting+Director)
#   - up to four dollar amounts found on the result page, one per line
#
# Notes:
#   - grep -E replaces egrep, which is obsolescent (GNU grep >= 3.8 warns).
#   - read -r keeps backslashes in the scraped paths intact.
#   - The regular expressions are kept exactly as originally written.

base_url="http://swz.salary.com"

wget -q -O- "${base_url}/salarywizard/layouthtmls/swzl_salaryrangenarrow_50.html" \
  | grep -E -o '(/salarywizard/layouthtmls/swzl_salaryrangenarrowjob_50.*.html)' \
  | while read -r listing_path; do
      wget -q -O- "${base_url}${listing_path}" \
        | grep -E -o '(/salarywizard/layoutscripts/swzl_compresult\.asp\?NarrowCode=.*=U\.S\. National Averages)' \
        | while read -r result_path; do
            # Job title: the text between "JobTitle=" and "&JobCode=".
            printf '%s\n' "${result_path}" \
              | grep -E -o '(JobTitle=.*&JobCode=)' \
              | awk -F '=' '{print $2}' \
              | awk -F '&' '{print $1}'

            # First four dollar amounts made of 6-10 digits/commas.
            wget -q -O- "${base_url}${result_path}" \
              | grep -E -o '(\$[0-9,]{6,10})' \
              | head -n 4
          done
    done

$ head -n 20 parse.salary.com.out
Accounting+Director
$89,568
$104,653
$143,633
$164,037
Actuarial+Fellow
$93,330
$111,579
$141,639
$150,759
Controller+Assistant
$77,556
$92,822
$131,129
$150,738
Treasurer+Assistant
$78,507
$96,724
$143,398
$167,676

$ cat sort.salary.com.py
#!/usr/bin/env python

from sys import argv
from urllib import unquote

def parse_salaries(lines):
    """Group scraped salary lines under the job title that precedes them.

    ``lines`` is an iterable of text lines: a URL-encoded job title
    (``+`` for spaces, ``%XX`` escapes) followed by salary lines that start
    with ``$``.  Returns a dict mapping the decoded title to a list of at
    most four salary strings with ``$`` and ``,`` removed.

    Blank lines are skipped, salary lines seen before any title are
    dropped, and when a title repeats only its first occurrence is kept
    (the salaries following the repeat are ignored).
    """
    job_sal = {}
    current = None  # title currently collecting salaries; None = discard
    for line in lines:
        data = unquote(line.replace('+', ' ')).strip()
        if not data:
            continue
        if data.startswith('$'):
            if current is not None and len(job_sal[current]) < 4:
                job_sal[current].append(data.replace('$', '').replace(',', ''))
        elif data in job_sal:
            # Repeated title: do not let its salaries leak into another job.
            current = None
        else:
            job_sal[data] = []
            current = data
    return job_sal


def format_row(title, sals):
    """Return one CSV row: the quoted title followed by four quoted salaries.

    Missing salaries are emitted as empty fields so a job with fewer than
    four figures still produces a well-formed five-column row.
    """
    cells = (list(sals) + [''] * 4)[:4]
    return '"%s","%s","%s","%s","%s"' % tuple([title] + cells)


def main(args):
    """Read the file named by ``args[1]`` and print one CSV row per job."""
    if len(args) < 2:
        raise SystemExit('usage: sort.salary.com.py FILE')
    with open(args[1], 'r') as f:
        job_sal = parse_salaries(f)
    for title, sals in job_sal.items():
        print(format_row(title, sals))


if __name__ == '__main__':
    main(argv)

$ head -n 20 sort.salary.com.csv
"Chief Executive Officer",350880,486253,828000,1003771
"Surgeon - Cardiothoracic",215774,314650,568163,690043
"Surgeon - Neurology",265533,353988,570565,679293
"Chief Operating Officer",240482,327456,552136,669720
"Surgeon - Heart Transplant",218340,315173,527658,624282
"Top Subsidiary Executive",209290,291660,504038,615028
"Dean of Medicine",214448,295160,485869,578788
"Surgeon - Orthopedic",218443,305359,489585,570399
"Top Sector Executive",187968,265828,455365,550069
"Chief of Surgery",178696,260582,450487,521668
"Top Merchandising Executive",130767,190690,401304,517577
"Physician - Radiology",261552,308236,431592,497217
"Physician - Maternal/Fetal Medicine",251459,302061,429869,495629
"Top Administrative Executive",146348,213410,389013,480671
"Surgeon - Vascular",197974,260083,407081,478806
"Top International Executive",142501,207801,380857,471963
"Surgeon - Plastic Reconstructive",201327,256010,396415,469563
"Chief Financial Officer",171060,230261,382551,462002
"Physician - Gastroenterology",187368,247357,390600,461027
"Physician - Radiation Therapy",172623,240971,391381,459974

0 comments: