The SAT, or Scholastic Aptitude Test, is a standardized test used by many competitive colleges in the U.S. to determine, in part, which students to admit. Standardized tests as an admission tool, however, have recently been questioned due to their potential role in perpetuating racial, gender, and income inequality. In this project, we use data from New York City high schools to investigate how various demographic factors impact SAT scores.
Specifically, in this notebook, we do the following:
Extract information related to our problem from 7 different messy datasets
Clean and combine the information into a single dataset
Use this dataset to investigate important questions related to SAT scores
The highlight of this project is the data cleaning process, which takes place in the first two bullet points above. Answering real-world questions with data often requires collecting messy data from multiple sources and combining it into a single clean dataset for analysis; in this project, we demonstrate that process. We also thoroughly explore some important questions regarding standardized testing inequality, such as the impact of race, as well as student-perceived school safety, on test scores.
The data used in this project can be found at the following links:
We've downloaded all datasets above and placed them in a folder titled "schools". We've also renamed them for convenience and consistency.
Since we're investigating SAT scores from New York City schools, there are a few pieces of general knowledge, confirmed by a quick glance at the data, to keep in mind throughout the data cleaning and analysis process:
The SAT is taken primarily by high schoolers, so information about other grade levels can be discarded.
New York City is divided into five boroughs, which gives a natural clustering of the high schools.
A finer subdivision of the schools is by district.
Each school is uniquely identified by a District Borough Number, or DBN (e.g., 01M292: district 01, borough M for Manhattan, school number 292).
Without further ado, let's move on to reading in the datasets.
Since we're working with multiple datasets, it will be useful to read each file in as a pandas dataframe, and store all dataframes into a single data dictionary.
import pandas as pd
data_files = [
"ap_2010.csv",
"class_size.csv",
"demographics.csv",
"graduation.csv",
"hs_directory.csv",
"sat_results.csv"
]
data = {}
for file in data_files:
    file_key = file[:-4]  #strip the .csv extension for the dictionary key
    if file in ("ap_2010.csv", "class_size.csv"):
        #these two files are semicolon-separated
        data[file_key] = pd.read_csv(f"schools/{file}", delimiter=';')
    else:
        data[file_key] = pd.read_csv(f"schools/{file}")
There are two other datasets, which are .txt files; we'll look at those later. For now, since we've read in the .csv data, let's glance at the first few rows of each dataframe.
for key in data.keys():
    print(data[key].head())
    print('\n\n\n')
[Output abridged. The first five rows of each dataframe are printed. Key columns: ap_2010 (DBN, SchoolName, AP Test Takers, Total Exams Taken, Number of Exams with scores 3 4 or 5); class_size (CSD, BOROUGH, SCHOOL CODE, SCHOOL NAME, GRADE, PROGRAM TYPE, and class-size statistics); demographics (DBN, Name, schoolyear, fl_percent/frl_percent, enrollment counts, and racial/gender breakdowns); graduation (Demographic, DBN, School Name, Cohort, and graduation outcome columns); hs_directory (dbn, school_name, borough, Location 1, and other directory fields); sat_results (DBN, SCHOOL NAME, Num of SAT Test Takers, and the three SAT section averages).]
A quick look at the output reveals the following key points:
Each dataset contains a DBN column, with one exception.
The exception has CSD and SCHOOL CODE columns, which appear to combine to form a DBN.
Hence, DBN can be used to combine the datasets. We also note that some datasets name the column dbn in lowercase.
Before we try to combine the data, let's look at those .txt files we mentioned earlier.
Looking at the .txt files, we see that entries are tab-separated. Looking more closely, we can see that the files use the windows-1252 encoding. We also see that the column names are identical, so it makes the most sense to combine these datasets by concatenating along axis 0 (stacking the rows). Hence, we can read these files into dataframes and combine them with the following code.
all_survey = pd.read_csv('schools/survey_all.txt', delimiter='\t', encoding='windows-1252')
d75_survey = pd.read_csv('schools/survey_d75.txt', delimiter='\t', encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0, ignore_index=True)  #ignore_index keeps row labels unique
print(survey.head())
[Output abridged. The concatenated survey dataframe has 2773 columns, beginning with dbn, bn, schoolname, d75, studentssurveyed, highschool, schooltype, and the response-rate fields rr_s, rr_t, rr_p, followed by hundreds of question-level fields such as s_q14_2 through s_q14_11, most of which are NaN in these rows.]
We have over 2,700 columns, and many of them have a large number of missing values. A quick look at the survey data dictionary (the documentation distributed with the files) reveals that only the following columns will be useful to us:
good_cols = ["DBN", "rr_s", "rr_t", "rr_p", "N_s", "N_t",
"N_p", "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11",
"saf_t_11", "com_t_11", "eng_t_11", "aca_t_11", "saf_s_11",
"com_s_11", "eng_s_11", "aca_s_11", "saf_tot_11", "com_tot_11",
"eng_tot_11", "aca_tot_11"]
We've included a column "DBN" because the original dataset uses the lowercase name "dbn", which is inconsistent with the other datasets. In the following code cell, we first add a DBN column to survey as a copy of the dbn column. Then, we restrict to the good columns and add survey to the data dictionary.
survey = survey.copy()  #consolidate the wide, fragmented frame before inserting a column
survey['DBN'] = survey['dbn']
survey = survey[good_cols]
data['survey'] = survey
Before moving on to create a DBN for that exception, let's fix the other instance of a lowercase dbn column. Looking at the previous output, we see that the hs_directory dataframe is the culprit. For now, let's just add a DBN column.
data['hs_directory']['DBN'] = data['hs_directory']['dbn']
Earlier, we noticed that the class_size dataframe is missing a DBN. Let's take a closer look.
data['class_size'].head()
 | CSD | BOROUGH | SCHOOL CODE | SCHOOL NAME | GRADE | PROGRAM TYPE | CORE SUBJECT (MS CORE and 9-12 ONLY) | CORE COURSE (MS CORE and 9-12 ONLY) | SERVICE CATEGORY(K-9* ONLY) | NUMBER OF STUDENTS / SEATS FILLED | NUMBER OF SECTIONS | AVERAGE CLASS SIZE | SIZE OF SMALLEST CLASS | SIZE OF LARGEST CLASS | DATA SOURCE | SCHOOLWIDE PUPIL-TEACHER RATIO |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | M | M015 | P.S. 015 Roberto Clemente | 0K | GEN ED | - | - | - | 19 | 1.0 | 19.0 | 19.0 | 19.0 | ATS | NaN |
1 | 1 | M | M015 | P.S. 015 Roberto Clemente | 0K | CTT | - | - | - | 21 | 1.0 | 21.0 | 21.0 | 21.0 | ATS | NaN |
2 | 1 | M | M015 | P.S. 015 Roberto Clemente | 01 | GEN ED | - | - | - | 17 | 1.0 | 17.0 | 17.0 | 17.0 | ATS | NaN |
3 | 1 | M | M015 | P.S. 015 Roberto Clemente | 01 | CTT | - | - | - | 17 | 1.0 | 17.0 | 17.0 | 17.0 | ATS | NaN |
4 | 1 | M | M015 | P.S. 015 Roberto Clemente | 02 | GEN ED | - | - | - | 15 | 1.0 | 15.0 | 15.0 | 15.0 | ATS | NaN |
Initially, it seems that CSD and SCHOOL CODE can be combined to make a DBN. Let's look at some DBNs to see what format we need to match.
data['hs_directory']['DBN']
0      27Q260
1      21K559
2      16K393
3      08X305
4      03M485
        ...
430    02M300
431    09X412
432    32K549
433    02M407
434    21K344
Name: DBN, Length: 435, dtype: object
We see that a DBN consists of a two-digit CSD followed by the school code. If the CSD is 2, for instance, and the school code is M300, then the DBN is 02M300; a one-digit CSD must be padded with a leading 0. Hence, we can easily create a DBN for the class_size dataframe in two steps:
Create a padded CSD
Concatenate the padded CSD with the SCHOOL CODE to create a DBN
We perform these steps in the following code cell.
data['class_size']['padded_CSD'] = data['class_size']['CSD'].apply(
    lambda x: '0' + str(x) if len(str(x)) < 2 else str(x))
data['class_size']['DBN'] = data['class_size']['padded_CSD'] + data['class_size']['SCHOOL CODE']
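As an aside, pandas' built-in string methods give an equivalent one-liner; a minimal sketch, assuming the same columns:
#equivalent padding with the vectorized str.zfill
data['class_size']['DBN'] = (data['class_size']['CSD'].astype(str).str.zfill(2)
                             + data['class_size']['SCHOOL CODE'])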
Let's take a look at the new column to make sure it has the right format.
data['class_size']['DBN']
0        01M015
1        01M015
2        01M015
3        01M015
4        01M015
          ...
27606    32K564
27607    32K564
27608    32K564
27609    32K564
27610    32K564
Name: DBN, Length: 27611, dtype: object
This column looks good. We're ready to move on to creating some useful statistics with the sat_results dataframe.
Notice that the sat_results dataframe has scores on the individual sections.
data['sat_results'].head()
 | DBN | SCHOOL NAME | Num of SAT Test Takers | SAT Critical Reading Avg. Score | SAT Math Avg. Score | SAT Writing Avg. Score |
---|---|---|---|---|---|---|
0 | 01M292 | HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES | 29 | 355 | 404 | 363 |
1 | 01M448 | UNIVERSITY NEIGHBORHOOD HIGH SCHOOL | 91 | 383 | 423 | 366 |
2 | 01M450 | EAST SIDE COMMUNITY SCHOOL | 70 | 377 | 402 | 370 |
3 | 01M458 | FORSYTH SATELLITE ACADEMY | 7 | 414 | 401 | 359 |
4 | 01M509 | MARTA VALLE HIGH SCHOOL | 44 | 390 | 433 | 384 |
It may be helpful to create a total score. To do this, we'll simply convert each column to numeric, and then add them together.
to_numeric_cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
#convert each score column to numeric
for col in to_numeric_cols:
    data['sat_results'][col] = pd.to_numeric(data['sat_results'][col], errors='coerce')
#initialize a total score column
data['sat_results']['sat_score'] = 0
#add up the three columns to create a total score
for col in to_numeric_cols:
    data['sat_results']['sat_score'] += data['sat_results'][col]
print(data['sat_results']['sat_score'].head())
0    1122.0
1    1172.0
2    1149.0
3    1174.0
4    1207.0
Name: sat_score, dtype: float64
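A side effect of summing with += is that any school missing one of the three section averages gets a NaN total rather than a misleading partial sum. A one-line equivalent that preserves this behavior (a sketch):
#sum the three sections, requiring all three values to be non-null
data['sat_results']['sat_score'] = data['sat_results'][to_numeric_cols].sum(axis=1, min_count=3)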
The hs_directory dataframe contains information about the location of each high school. This could be helpful for visualizing any geographic trends in SAT scores. Let's take a look.
print(data['hs_directory']['Location 1'].head())
0    8 21 Bay 25 Street\nFar Rockaway, NY 11691\n(4...
1    2630 Benson Avenue\nBrooklyn, NY 11214\n(40.59...
2    1014 Lafayette Avenue\nBrooklyn, NY 11221\n(40...
3    1980 Lafayette Avenue\nBronx, NY 10473\n(40.82...
4    100 Amsterdam Avenue\nNew York, NY 10023\n(40....
Name: Location 1, dtype: object
Each location string ends with a parenthesized (latitude, longitude) pair. First, let's write regex-based functions that match the parenthetical and extract its contents.
Recall the following regex patterns: \( and \) match literal parentheses, . matches any single character, and + matches one or more repetitions of the preceding pattern, so \(.+\) matches an entire parenthesized expression.
We can then use string methods, such as string.replace(), to extract the longitude and latitude.
import re
def extract_lat(string):
    coords = re.findall(r"\(.+\)", string)[0]  #match the parenthesized coordinates
    coords = coords.replace('(', '')  #remove parentheses and commas
    coords = coords.replace(')', '')
    coords = coords.replace(',', '')
    coord_list = coords.split(' ')  #split into [latitude, longitude]
    latitude = coord_list[0]
    return latitude

def extract_lon(string):
    coords = re.findall(r"\(.+\)", string)[0]  #match the parenthesized coordinates
    coords = coords.replace('(', '')  #remove parentheses and commas
    coords = coords.replace(')', '')
    coords = coords.replace(',', '')
    coord_list = coords.split(' ')  #split into [latitude, longitude]
    longitude = coord_list[1]
    return longitude
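As an aside, pandas can pull both coordinates out in one vectorized pass; a minimal sketch, assuming every entry ends with a "(latitude, longitude)" pair as above:
#extract named lat/lon capture groups directly from the column
coords = data['hs_directory']['Location 1'].str.extract(r'\((?P<lat>.+),\s*(?P<lon>.+)\)')
The resulting lat and lon columns would then be converted with pd.to_numeric exactly as below.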
Let's use these functions to create new columns in the hs_directory, and convert them to numeric.
#create lat and lon columns
data['hs_directory']['lat'] = data['hs_directory']['Location 1'].apply(extract_lat)
data['hs_directory']['lon'] = data['hs_directory']['Location 1'].apply(extract_lon)
#convert new columns to numeric
data['hs_directory']['lat'] = pd.to_numeric(data['hs_directory']['lat'], errors = 'coerce')
data['hs_directory']['lon'] = pd.to_numeric(data['hs_directory']['lon'], errors = 'coerce')
data['hs_directory'][['lat','lon']]
 | lat | lon |
---|---|---|
0 | 40.601989 | -73.762834 |
1 | 40.593594 | -73.984729 |
2 | 40.692134 | -73.931503 |
3 | 40.822304 | -73.855961 |
4 | 40.773671 | -73.985269 |
... | ... | ... |
430 | 40.765027 | -73.992517 |
431 | 40.840373 | -73.910838 |
432 | 40.696962 | -73.910816 |
433 | 40.732485 | -73.983053 |
434 | 40.582116 | -73.972894 |
435 rows × 2 columns
We're getting close to combining our data, but there are a few steps left.
Since DBNs will be used to combine the data, we have to ensure they are unique within each dataframe. That is certainly not the case for all of the dataframes, so let's check which ones have duplicated DBNs.
for df in data.keys():
    print(f"nonunique values in {df}: {data[df]['DBN'].duplicated().any()}")
nonunique values in ap_2010: True
nonunique values in class_size: True
nonunique values in demographics: True
nonunique values in graduation: True
nonunique values in hs_directory: False
nonunique values in sat_results: False
nonunique values in survey: False
Let's take a closer look at each one, starting with class_size.
data['class_size'].head()
 | CSD | BOROUGH | SCHOOL CODE | SCHOOL NAME | GRADE | PROGRAM TYPE | CORE SUBJECT (MS CORE and 9-12 ONLY) | CORE COURSE (MS CORE and 9-12 ONLY) | SERVICE CATEGORY(K-9* ONLY) | NUMBER OF STUDENTS / SEATS FILLED | NUMBER OF SECTIONS | AVERAGE CLASS SIZE | SIZE OF SMALLEST CLASS | SIZE OF LARGEST CLASS | DATA SOURCE | SCHOOLWIDE PUPIL-TEACHER RATIO | padded_CSD | DBN |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | M | M015 | P.S. 015 Roberto Clemente | 0K | GEN ED | - | - | - | 19 | 1.0 | 19.0 | 19.0 | 19.0 | ATS | NaN | 01 | 01M015 |
1 | 1 | M | M015 | P.S. 015 Roberto Clemente | 0K | CTT | - | - | - | 21 | 1.0 | 21.0 | 21.0 | 21.0 | ATS | NaN | 01 | 01M015 |
2 | 1 | M | M015 | P.S. 015 Roberto Clemente | 01 | GEN ED | - | - | - | 17 | 1.0 | 17.0 | 17.0 | 17.0 | ATS | NaN | 01 | 01M015 |
3 | 1 | M | M015 | P.S. 015 Roberto Clemente | 01 | CTT | - | - | - | 17 | 1.0 | 17.0 | 17.0 | 17.0 | ATS | NaN | 01 | 01M015 |
4 | 1 | M | M015 | P.S. 015 Roberto Clemente | 02 | GEN ED | - | - | - | 15 | 1.0 | 15.0 | 15.0 | 15.0 | ATS | NaN | 01 | 01M015 |
We notice that there is a GRADE column (note the trailing space in the column name, 'GRADE '). Since we are only analyzing high school data, we'll restrict to GRADE entries of '09-12'. Moreover, some grades have multiple program types; since GEN ED is generally the largest, we'll further restrict to the single PROGRAM TYPE 'GEN ED'.
class_size = data['class_size']
class_size = class_size[class_size['GRADE ']=='09-12']
class_size = class_size[class_size['PROGRAM TYPE'] == 'GEN ED']
print(class_size.head())
[Output abridged. Every remaining row has GRADE '09-12' and PROGRAM TYPE 'GEN ED', but each DBN (e.g. 01M292) still appears once per core course (English 9 through English 12, Integrated Algebra, ...), so DBN is still not unique.]
We can immediately see that the DBN is still not unique, due to separate reporting for each course. To remedy this, we'll group by DBN and report the mean of each numeric column within each group. After this, we replace the class_size dataframe in the data dictionary with the cleaned version.
import numpy as np
class_size = class_size.groupby('DBN').mean(numeric_only=True)  #mean of each numeric column per school
class_size.reset_index(inplace=True)
data['class_size'] = class_size
print(data['class_size'].head())
[Output abridged. class_size now has one row per DBN (01M292, 01M332, 01M378, ...), containing the mean CSD, NUMBER OF SECTIONS, AVERAGE CLASS SIZE, smallest and largest class sizes, and pupil-teacher ratio for each school.]
Great! We retain the numeric information from each school, and DBN is now unique.
We still have a few dataframes with nonunique DBNs:
for df in data.keys():
    print(f"nonunique values in {df}: {data[df]['DBN'].duplicated().any()}")
nonunique values in ap_2010: True
nonunique values in class_size: False
nonunique values in demographics: True
nonunique values in graduation: True
nonunique values in hs_directory: False
nonunique values in sat_results: False
nonunique values in survey: False
Let's move on to cleaning the demographics dataset.
Let's look at the first few rows.
data['demographics'].head()
 | DBN | Name | schoolyear | fl_percent | frl_percent | total_enrollment | prek | k | grade1 | grade2 | ... | black_num | black_per | hispanic_num | hispanic_per | white_num | white_per | male_num | male_per | female_num | female_per |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 01M015 | P.S. 015 ROBERTO CLEMENTE | 20052006 | 89.4 | NaN | 281 | 15 | 36 | 40 | 33 | ... | 74 | 26.3 | 189 | 67.3 | 5 | 1.8 | 158.0 | 56.2 | 123.0 | 43.8 |
1 | 01M015 | P.S. 015 ROBERTO CLEMENTE | 20062007 | 89.4 | NaN | 243 | 15 | 29 | 39 | 38 | ... | 68 | 28.0 | 153 | 63.0 | 4 | 1.6 | 140.0 | 57.6 | 103.0 | 42.4 |
2 | 01M015 | P.S. 015 ROBERTO CLEMENTE | 20072008 | 89.4 | NaN | 261 | 18 | 43 | 39 | 36 | ... | 77 | 29.5 | 157 | 60.2 | 7 | 2.7 | 143.0 | 54.8 | 118.0 | 45.2 |
3 | 01M015 | P.S. 015 ROBERTO CLEMENTE | 20082009 | 89.4 | NaN | 252 | 17 | 37 | 44 | 32 | ... | 75 | 29.8 | 149 | 59.1 | 7 | 2.8 | 149.0 | 59.1 | 103.0 | 40.9 |
4 | 01M015 | P.S. 015 ROBERTO CLEMENTE | 20092010 | NaN | 96.5 | 208 | 16 | 40 | 28 | 32 | ... | 67 | 32.2 | 118 | 56.7 | 6 | 2.9 | 124.0 | 59.6 | 84.0 | 40.4 |
5 rows × 38 columns
It's immediately clear that the "schoolyear" column is preventing the DBN from being unique: each school appears once per year. Since the SAT results correspond to the 2011-2012 school year, we'll restrict to that year.
data['demographics'] = data['demographics'][data['demographics']['schoolyear'] == 20112012]
print(data['demographics'].head())
[Output abridged. Only rows with schoolyear 20112012 remain, one per school (01M015, 01M019, 01M020, ...), so DBN is now unique in demographics.]
Next, let's take a look at the graduation dataframe.
data['graduation'].head()
 | Demographic | DBN | School Name | Cohort | Total Cohort | Total Grads - n | Total Grads - % of cohort | Total Regents - n | Total Regents - % of cohort | Total Regents - % of grads | ... | Regents w/o Advanced - n | Regents w/o Advanced - % of cohort | Regents w/o Advanced - % of grads | Local - n | Local - % of cohort | Local - % of grads | Still Enrolled - n | Still Enrolled - % of cohort | Dropped Out - n | Dropped Out - % of cohort |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Total Cohort | 01M292 | HENRY STREET SCHOOL FOR INTERNATIONAL | 2003 | 5 | s | NaN | s | NaN | NaN | ... | s | NaN | NaN | s | NaN | NaN | s | NaN | s | NaN |
1 | Total Cohort | 01M292 | HENRY STREET SCHOOL FOR INTERNATIONAL | 2004 | 55 | 37 | 67.3 | 17 | 30.9 | 45.9 | ... | 17 | 30.9 | 45.9 | 20 | 36.4 | 54.1 | 15 | 27.3 | 3 | 5.5 |
2 | Total Cohort | 01M292 | HENRY STREET SCHOOL FOR INTERNATIONAL | 2005 | 64 | 43 | 67.2 | 27 | 42.2 | 62.8 | ... | 27 | 42.2 | 62.8 | 16 | 25.0 | 37.2 | 9 | 14.1 | 9 | 14.1 |
3 | Total Cohort | 01M292 | HENRY STREET SCHOOL FOR INTERNATIONAL | 2006 | 78 | 43 | 55.1 | 36 | 46.2 | 83.7 | ... | 36 | 46.2 | 83.7 | 7 | 9.0 | 16.3 | 16 | 20.5 | 11 | 14.1 |
4 | Total Cohort | 01M292 | HENRY STREET SCHOOL FOR INTERNATIONAL | 2006 Aug | 78 | 44 | 56.4 | 37 | 47.4 | 84.1 | ... | 37 | 47.4 | 84.1 | 7 | 9.0 | 15.9 | 15 | 19.2 | 11 | 14.1 |
5 rows × 23 columns
We can see that the same DBN has different Cohort names. This likely corresponds to the year. Let's take a look.
data['graduation']['Cohort'].value_counts()
2006 Aug    4131
2006        4130
2005        3963
2004        3708
2003        3432
2002        3095
2001        2637
Name: Cohort, dtype: int64
We'll filter to the most recent complete cohort, 2006 (the '2006 Aug' rows appear to report the same cohort updated with August outcomes). We also want to make sure we look at the entire cohort, and not some demographic subset, so we'll filter the Demographic column as well.
data['graduation']=data['graduation'][data['graduation']['Cohort'] == '2006']
data['graduation']=data['graduation'][data['graduation']['Demographic'] == 'Total Cohort']
print(data['graduation'].head())
[Output abridged. Only 'Total Cohort' rows from the 2006 cohort remain, one per school (01M292, 01M448, 01M450, ...), so DBN is now unique in graduation.]
Now only one dataframe, ap_2010, should contain nonunique DBNs. Let's confirm.
for df in data.keys():
    print(f"nonunique values in {df}: {data[df]['DBN'].duplicated().any()}")
nonunique values in ap_2010: True
nonunique values in class_size: False
nonunique values in demographics: False
nonunique values in graduation: False
nonunique values in hs_directory: False
nonunique values in sat_results: False
nonunique values in survey: False
Let's clean this dataframe, and then combine everything into a single dataframe.
Let's take a look at the first few rows.
print(data['ap_2010'].columns.to_list())
data['ap_2010'].head()
['DBN', 'SchoolName', 'AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']
 | DBN | SchoolName | AP Test Takers | Total Exams Taken | Number of Exams with scores 3 4 or 5 |
---|---|---|---|---|---|
0 | 01M448 | UNIVERSITY NEIGHBORHOOD H.S. | 39 | 49 | 10 |
1 | 01M450 | EAST SIDE COMMUNITY HS | 19 | 21 | NaN |
2 | 01M515 | LOWER EASTSIDE PREP | 24 | 26 | 24 |
3 | 01M539 | NEW EXPLORATIONS SCI,TECH,MATH | 255 | 377 | 191 |
4 | 02M296 | High School of Hospitality Management | NaN | NaN | NaN |
The final three columns will be useful in our analysis. Let's convert them to numeric.
cols = data['ap_2010'].columns.to_list()[-3:]  #get the last three columns
#convert these columns to numeric
for col in cols:
    data['ap_2010'][col] = pd.to_numeric(data['ap_2010'][col], errors='coerce')
print(data['ap_2010'].dtypes)
DBN                                      object
SchoolName                               object
AP Test Takers                          float64
Total Exams Taken                       float64
Number of Exams with scores 3 4 or 5    float64
dtype: object
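The per-column loop above can also be written as a single call; a minimal sketch, equivalent under the same errors='coerce' coercion:
#convert all three columns at once
data['ap_2010'][cols] = data['ap_2010'][cols].apply(pd.to_numeric, errors='coerce')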
Now, let's see which entries have identical DBNs.
data['ap_2010'][data['ap_2010']['DBN'].duplicated(keep = False)]
 | DBN | SchoolName | AP Test Takers | Total Exams Taken | Number of Exams with scores 3 4 or 5 |
---|---|---|---|---|---|
51 | 04M610 | THE YOUNG WOMEN'S LEADERSHIP SCHOOL OF EAST HA... | 41.0 | 55.0 | 29.0 |
52 | 04M610 | YOUNG WOMEN'S LEADERSHIP SCH | NaN | NaN | NaN |
Clearly, this school was entered twice. The second entry is missing all of the important information, so we'll drop it.
ap_2010 = data['ap_2010']
ap_2010.drop(index=52, inplace=True)
ap_2010.reset_index(drop=True, inplace=True)  #drop=True avoids adding a leftover 'index' column
data['ap_2010'] = ap_2010
Now all DBNs should be unique. Let's check, just in case.
for df in data.keys():
    print(f"nonunique values in {df}: {data[df]['DBN'].duplicated().any()}")
nonunique values in ap_2010: False
nonunique values in class_size: False
nonunique values in demographics: False
nonunique values in graduation: False
nonunique values in hs_directory: False
nonunique values in sat_results: False
nonunique values in survey: False
Indeed, the DBNs are now unique in every dataframe! We can move on to combining the data.
Since we're interested in how SAT scores relate to various demographic and geographic information, we'll start with the sat_results dataframe and left-merge the dataframes whose information is less central to our analysis, namely ap_2010 and graduation.
combined = data["sat_results"]
combined = combined.merge(data['ap_2010'],how = 'left')
combined = combined.merge(data['graduation'], how = 'left')
print(combined.head())
print(combined.shape)
[Output abridged. combined now holds the sat_results columns plus the ap_2010 and graduation fields; schools absent from those datasets (e.g. 01M292, 01M458) show NaN in the merged columns. Shape: (478, 33).]
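As an aside, when merging on a key it can be worth asserting the expected key relationship; a minimal sketch of the first merge above using merge's validate parameter, which raises MergeError if DBN is unexpectedly duplicated on either side:
#assert that DBN is unique on both sides of the merge
combined = data['sat_results'].merge(data['ap_2010'], on='DBN', how='left', validate='one_to_one')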
For the remaining dataframes, the information in each is important for our analysis, so we only want to keep schools whose DBNs appear both in combined and in each remaining dataframe. This suggests an inner join.
combined = combined.merge(data['class_size'], on='DBN', how='inner')
combined = combined.merge(data['demographics'], on='DBN', how='inner')
combined = combined.merge(data['survey'], on='DBN', how='inner')
combined = combined.merge(data['hs_directory'], on='DBN', how='inner')
print(combined.head())
print(combined.shape)
[Output abridged. The fully merged dataframe now includes the class size, demographic, survey, and directory fields for each school, through lat and lon. Shape: (362, 164).]
Now that we've combined our data, we have a number of missing values. Let's fill these in: first with column means, then with 0 for anything that remains. Note that filling with means only affects numeric columns; the second fillna(0) catches the rest, including missing entries in string columns.
means = combined.mean(numeric_only=True)  #column means of the numeric columns
combined = combined.fillna(means)
combined = combined.fillna(0)
print(combined.head())
[Output abridged. Missing numeric entries, such as AP Test Takers for schools without ap_2010 data, now hold the column mean, and remaining missing values, including string columns such as SchoolName, hold 0.]
As a final step, we can create a column named "school_dist" corresponding to the school district. Recall that the school district is given by the first two characters of the DBN. We extract it with string indexing in the following cell.
combined['school_dist'] = combined['DBN'].apply(lambda x:x[0:2])
print(combined['school_dist'][0:5])
0    01
1    01
2    01
3    01
4    01
Name: school_dist, dtype: object
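Equivalently, pandas' vectorized string accessor avoids the per-row lambda; a minimal sketch:
#slice the first two characters of every DBN at once
combined['school_dist'] = combined['DBN'].str[:2]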
Let's take a look at the columns of the final dataframe.
for col in combined.columns:
    print(col)
[Output abridged. The full list of column names: DBN, SCHOOL NAME, and the SAT score fields; the ap_2010 and graduation fields; the class size fields; the demographic fields; the survey fields; the hs_directory fields; and finally lat, lon, and school_dist.]
This concludes the data cleaning process. We'll now use the above features to investigate potential sources of testing inequality.
Let's begin by looking at how survey responses from parents, students, and teachers correlate with test scores.
#create lists of survey fields based on respondent type (student|parent|teacher|total)
student_survey_fields = ["rr_s",
"N_s",
"saf_s_11",
"com_s_11",
"eng_s_11",
"aca_s_11"
]
teacher_survey_fields = ["rr_t",
"N_t",
"saf_t_11",
"com_t_11",
"eng_t_11",
"aca_t_11"
]
parent_survey_fields = ["rr_p",
"N_p",
"saf_p_11",
"com_p_11",
"eng_p_11",
"aca_p_11"
]
tot_survey_fields = [
"saf_tot_11",
"com_tot_11",
"eng_tot_11",
"aca_tot_11"
]
#Dictionary mapping feature names to intuitive names
survey_dictionary = {}
survey_dictionary["rr_s"] = "Response Rate"
survey_dictionary["N_s"] = "Number of Repondents"
survey_dictionary["saf_s_11"] = "Safety Score"
survey_dictionary["com_s_11"] = "Communication Score"
survey_dictionary["eng_s_11"] = "Engagement Score"
survey_dictionary["aca_s_11"] = "Academic Expectations Score"
#compute correlations for plotting
correlations = combined.corr(numeric_only=True)  #numeric_only avoids errors on string columns
correlations = correlations['sat_score']
#___________________________________________________________
#initialize plot
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
fig,ax = plt.subplots(figsize = (10,6))
#___________________________________________________________
#generate main plot
x_axis = np.arange(len(student_survey_fields))
ax.barh(x_axis + .2,correlations[student_survey_fields], alpha = .5,height = .2,label = "Students")
ax.barh(x_axis,correlations[parent_survey_fields],alpha = .5,height = .2, label = "Parents" )
ax.barh(x_axis - .2,correlations[teacher_survey_fields],alpha = .5,height = .2, label = "Teachers")
#___________________________________________________________
#remove ticks and format legend
ax.grid(visible=False)
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_xticks([])
ax.set_yticks([])
ax.legend(loc = 'upper left', frameon = False, fontsize = 'small')
#___________________________________________________________
#create title and subtitle
ax.text(0,6.5, 'Students Know Best', weight = 'bold', size = 17, alpha = .75)
ax.text(0,6.1, 'Correlation of survey responses with SAT score', alpha = .85)
ax.text(0,5.8, 'grouped by respondent type (students | parents | teachers)', alpha = .85)
#___________________________________________________________
#place student correlation values, and put intuitive labels on top
for i, v in enumerate(correlations[student_survey_fields]):
    ax.text(v, i+.2, str(round(v,2)), color='black', fontsize=10, ha='left', va='center', alpha = .75)
    ax.text(0, i+.37, survey_dictionary[student_survey_fields[i]], color='black', weight = 'bold',alpha = .75, fontsize=10, ha='left', va='center')
Interestingly, student responses to every survey question correlate more strongly with SAT scores than the corresponding parent or teacher responses. Looking more closely at the communication score, we see that parents' perception of communication is actually negatively correlated with SAT score.
Let's take a closer look at the safety score, which measures how safe each respondent perceives the school to be. Student perception of safety has the strongest correlation with SAT score of the three groups. To see how perceived safety may affect test performance, let's plot SAT score vs. student-perceived safety score.
#initialize plot
fig,ax = plt.subplots(figsize = (10,6))
#___________________________________________________________
#create main plot
ax.scatter(combined["saf_s_11"], combined["sat_score"],color = "orangered", alpha = .5)
#___________________________________________________________
#remove grid
ax.grid(visible= False, axis = 'y')
#___________________________________________________________
#title and subtitle
ax.text(4.5,2500, 'Perceived Classroom Safety Influences SAT Performance', weight = 'bold', size = 17, alpha = .75)
ax.text(4.5,2400, 'SAT score vs. safety score (from student survey responses)', alpha = .85)
#___________________________________________________________
#emphasize low maximum for unsafe schools
ax.text(6.45,1750, 'Unsafe learning', color = 'grey', ha='right', va='center')
ax.text(6.45,1700, 'conditions put a ceiling', color = 'grey', ha='right', va='center')
ax.text(6.45,1650, 'on test performance', color = 'grey', ha='right', va='center')
ax.vlines(6.5, ymin = 1600, ymax = 1800,color = 'black',linewidth = 2)
#___________________________________________________________
#emphasize higher maximum for safe schools
ax.text(8.05,2150, 'Safe classrooms', color = 'grey', ha='left', va='center')
ax.text(8.05,2100, 'allow students', color = 'grey', ha='left', va='center')
ax.text(8.05,2050, 'to thrive', color = 'grey', ha='left', va='center')
ax.vlines(8, ymin = 2000, ymax = 2200,color = 'black',linewidth = 2)
#___________________________________________________________
#horizontal lines and grey tick parameters for visual
ax.tick_params(colors = 'grey', which = 'both')
ax.set_yticks([800,1000,1200,1400,1600,1800,2000,2200])  #fix the tick positions before relabeling
ax.set_yticklabels(['','1000','','1400','','1800', '','2200'])
ax.hlines([1000,1400,1800,2200], xmin = 5,xmax=9, linewidth = .5, color = 'grey',alpha = .5)
We see that no school with a student-perceived safety score of 6.5 or less averaged higher than 1600 on the SAT. On the other hand, almost every school that averaged above 1600 had a student-perceived safety score of 7.0 or greater. This suggests that unsafe classrooms put a ceiling on how well students are able to perform on the SAT.
Let's take a closer look at the safety scores to see which racial groups are most affected by unsafe classrooms.
#___________________________________________________________
#extract desired features from combined DataFrame
race_per = ["white_per","asian_per","hispanic_per","black_per"]
race_df = combined.loc[:,race_per + ["saf_s_11"]].copy()  #copy so added columns don't modify combined
#___________________________________________________________
#create labels for safety interval categories and endpoints for the averaging intervals
categories = ['below 5.5','5.5-6.0','6.0-6.5','6.5-7.0','7.0-7.5','7.5-8.0','above 8.0']
endpoints = [5.5,6,6.5,7,7.5,8]
#___________________________________________________________
#label each data point with the appropriate category / bin
race_df.loc[race_df['saf_s_11'] < endpoints[0],'bin'] = categories[0]
for i,item in enumerate(endpoints):
    if i < len(endpoints) - 1:
        race_df.loc[(item<=race_df["saf_s_11"]) & (race_df["saf_s_11"]<endpoints[i+1]) ,'bin'] = categories[i+1]
    else:
        #use >= so schools scoring exactly 8.0 are not left unbinned
        race_df.loc[race_df["saf_s_11"]>=item, 'bin'] = categories[i+1]
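#___________________________________________________________
#aside: the same binning can be done in one call with pd.cut;
#a minimal sketch, assuming the categories and endpoints above:
#  bins = [-np.inf, 5.5, 6, 6.5, 7, 7.5, 8, np.inf]
#  race_df['bin'] = pd.cut(race_df['saf_s_11'], bins=bins, labels=categories, right=False)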
#___________________________________________________________
#generate DataFrame for plotting
#resulting dataframe contains category as its index, race as
# its features, and mean percentage of race j in bin i as its
#(i,j)-th entry
plot_data = pd.DataFrame([], columns=race_per, index=range(0,len(categories)))
for bin,cat in enumerate(categories):
    for race in race_per:
        plot_data.loc[bin,race] = np.round(race_df.loc[race_df.loc[:,'bin'] == cat, race].mean(),2)
#add a feature for unaccounted races
plot_data['other_per'] = 100.00 - plot_data.sum(axis = 1)
#___________________________________________________________
#initialize plot
fig,ax = plt.subplots(figsize = (10,6))
x_axis = categories
#___________________________________________________________
#stack bars on top of each other to create a proportional plot
for i,race in enumerate(race_per + ['other_per']):
    if i == 0:
        ax.barh(x_axis, plot_data[race], alpha = .75,label = race[:-4])
    else:
        #cumsum used here to make bars stack on top of each other
        ax.barh(x_axis, plot_data[race], left = plot_data.cumsum(axis = 1).iloc[:,i-1],alpha = .75,label = race[:-4])
#___________________________________________________________
#remove grids and make ticks and labels grey, format legend
ax.tick_params(colors = 'grey', which = 'both')
ax.grid(visible= False, axis = 'y')
ax.grid(visible= False, axis = 'x')
leg = ax.legend(bbox_to_anchor=([0.25, .95, 0, 0]), ncol=5, frameon=False, fontsize = 'small')
#___________________________________________________________
#grey out the legend text, then place title and subtitle
for text in leg.get_texts():
    text.set_color('gray')
ax.text(-15,7.9, 'Safety Ceiling Disproportionately Affects Black Students', weight = 'bold', size = 17, alpha = .75)
ax.text(-15,7.5, 'Racial demographics grouped by student perceived safety score', alpha = .85)
We see that black students make up a smaller and smaller percentage of schools as the safety rating increases, while the white and asian shares grow. This suggests that the performance ceiling imposed by a lack of safety affects black and hispanic students more than white students, and that a larger percentage of white and asian students attend, and so can benefit from, schools with high safety ratings.
It may also be illuminating to see how racial demographics differ among schools with varying SAT scores.
#_______________________________________________________
#extract desired features
race_per = ["white_per","asian_per","hispanic_per","black_per"]
race_df = combined.loc[:,race_per + ["sat_score"]].copy()  #copy so added columns don't modify combined
#_______________________________________________________
#create category names and endpoints for averaging intervals
categories = ['below 1000','1000-1050','1050-1100','1100-1150','1150-1200',
'1200-1250','1250-1300','1300-1350','1350-1400','1400-1450','1450-1500',
'1500-1550','1550-1600','above 1600']
endpoints = [1000,1050,1100,1150,1200,1250,1300,1350,1400,1450,1500,1550,1600]
#_______________________________________________________
#label each entry with the appropriate category
race_df.loc[race_df['sat_score'] < endpoints[0],'bin'] = categories[0]
for i,item in enumerate(endpoints):
    if i < len(endpoints) - 1:
        race_df.loc[(item<=race_df["sat_score"]) & (race_df["sat_score"]<endpoints[i+1]) ,'bin'] = categories[i+1]
    else:
        #use >= so schools scoring exactly 1600 are not left unbinned
        race_df.loc[race_df["sat_score"]>=item, 'bin'] = categories[i+1]
#_______________________________________________________
#generate data for plotting, as in previous code cell
plot_data = pd.DataFrame([], columns=race_per, index=range(0,len(categories)))
for bin,cat in enumerate(categories):
    for race in race_per:
        plot_data.loc[bin,race] = np.round(race_df.loc[race_df.loc[:,'bin'] == cat, race].mean(),2)
plot_data['other_per'] = 100.00 - plot_data.sum(axis = 1)
#_______________________________________________________
#initialize plot
x_axis = categories
fig,ax = plt.subplots(figsize = (10,6))
#_______________________________________________________
#create plot, stacking bars on top of each other
for i,race in enumerate(race_per + ['other_per']):
    if i == 0:
        ax.barh(x_axis, plot_data[race], alpha = .75,label = race[:-4])
    else:
        ax.barh(x_axis, plot_data[race], left = plot_data.cumsum(axis = 1).iloc[:,i-1],alpha = .75,label = race[:-4])
#_______________________________________________________
#remove grid, format tick colors and labels, format legend
ax.tick_params(colors = 'grey', which = 'both')
ax.set_xticks([0,20,40,60,80,100])  #fix the tick positions before relabeling
ax.set_xticklabels(['0%','20%','40%','60%','80%','100%'])
ax.grid(visible= False, axis = 'y')
ax.grid(visible= False, axis = 'x')
leg = ax.legend(bbox_to_anchor=([0.25, .95, 0, 0]), ncol=5, frameon=False, fontsize = 'small')
#_______________________________________________________
#grey out the legend text, then place title and subtitle
for text in leg.get_texts():
    text.set_color('gray')
ax.text(-15,16.5, 'Sharp Race Disparity Between High and Low Performing Schools', weight = 'bold', size = 17, alpha = .75)
ax.text(-15,15.75, 'Racial demographics grouped by SAT Score', alpha = .85)
In the above, we see that schools scoring higher on the SAT are made up of increasingly large proportions of white and asian students. While the proportion of black students does not strictly decrease as score increases (as it did in the previous graph), the proportion of hispanic students does tend to decrease as score increases. In particular, hispanic students make up a majority of the population at the schools scoring below 1000.
One factor that could be related is whether students are English language learners. Luckily, we have the percentage of English language learners at each school. Let's plot SAT score vs. English language learner percentage.
#_______________________________________________________
#initialize plot
fig,ax = plt.subplots(figsize = (10,6))
#___________________________________________________________
#create the desired scatter plot
ax.scatter(combined["ell_percent"], combined["sat_score"],color = "teal", alpha = .5)
#___________________________________________________________
#remove grids
ax.grid(visible= False, axis = 'y')
#___________________________________________________________
#place title and subtitle
ax.text(-10,2600, 'Language Barrier Offers Partial Explanation for Race Disparity', weight = 'bold', size = 17, alpha = .75)
ax.text(-10,2520, 'in Test Performance', weight = 'bold', size = 17, alpha = .75)
ax.text(-10,2450, 'SAT score vs. English language learner percentage', alpha = .85)
#_______________________________________________________
#highlight higher performance only when ell_percent is small
ax.text(39.5,1950, 'Performance improves', color = 'grey', ha='right', va='center')
ax.text(39.5,1900, 'only when >90% of students', color = 'grey', ha='right', va='center')
ax.text(39.5,1850, 'possess English fluency', color = 'grey', ha='right', va='center')
ax.vlines(40, ymin = 1800, ymax = 2000,color = 'black',linewidth = 2)
#_______________________________________________________
#emphasize lower maximum performance when ell_percent is large
ax.text(60.5,1450, 'Predominantly English', color = 'grey', ha='left', va='center')
ax.text(60.5,1400, 'language learning', color = 'grey', ha='left', va='center')
ax.text(60.5,1350, 'classrooms struggle', color = 'grey', ha='left', va='center')
ax.vlines(60, ymin = 1300, ymax = 1500,color = 'black',linewidth = 2)
#_______________________________________________________
#fix tick color, labels, and add horizontal lines in proper places
ax.tick_params(colors = 'grey', which = 'both')
ax.set_yticks([800,1000,1200,1400,1600,1800,2000,2200])  #fix the tick positions before relabeling
ax.set_yticklabels(['','1000','','1400','','1800', '','2200'])
ax.hlines([1000,1400,1800,2200], xmin = 0,xmax=100, linewidth = .5, color = 'grey',alpha = .5)
This plot clearly shows that schools with higher percentages of English language learners tend not to perform as well. This partially explains the earlier trend: poorly performing schools tend to have a large proportion of hispanic students, many of whom are English language learners.
Interestingly, performance doesn't increase gradually as the percentage of English language learners decreases. Instead, we observe a similar performance grouping among all schools with an English language learner percentage above roughly 7.5%. This suggests that the lower average score isn't driven only by the individual students learning English, but affects the entire classroom.
In this notebook, we cleaned and combined 7 different datasets containing information about SAT scores among NYC high schools. After carefully cleaning and combining the data, we were able to investigate some interesting factors related to SAT test performance.
We made the following observations about NYC SAT test takers:
Student survey responses correlate more strongly with SAT scores than parent or teacher responses, with student-perceived safety standing out.
No school with a student-perceived safety score of 6.5 or less averaged above 1600, suggesting that unsafe conditions place a ceiling on performance.
That safety ceiling disproportionately affects black and hispanic students, while high-scoring schools have increasingly large white and asian populations.
Schools with high percentages of English language learners perform markedly worse, which partially explains the racial disparity.
We see that SAT score is greatly impacted by many factors that are simply out of the students' control. Avoiding these factors does not necessarily ensure a good SAT performance, but the presence of such obstacles to learning, such as language barriers in the classroom or a lack of safety, does prevent students from achieving scores as high as those of their more fortunate counterparts.