nhanes/ XXXXXXXXXX/Demographics/.ipynb_checkpoints/Scrape-checkpoint.ipynb { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n",...

All questions for the assignment are in the lab book.


nhanes/2015-2016/Demographics/.ipynb_checkpoints/Scrape-checkpoint.ipynb { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import os\n", "import urllib\n", "import time" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def get_parent_page():\n", " link = \"https://wwwn.cdc.gov/nchs/nhanes/Search/DataPage.aspx?Component=Demographics\"\n", " response = urllib.request.urlopen(link)\n", " html = response.read().decode()\n", " lines = html.strip().split(\"\\n\")\n", " return lines" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "parent_lines = get_parent_page()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Define More Functions\n", "\n", "\n", "def get_new_link(filename , lines=parent_lines):\n", " sublines = [l for l in lines if filename in l]\n", " docline = [l for l in sublines if '.htm' in l]\n", " splitstr = docline[0].split('\"')\n", " if len(splitstr)==3:\n", " newlink = splitstr[1]\n", " else:\n", " newlink = None\n", " newlink = 'https://wwwn.cdc.gov/' + newlink\n", " return newlink\n", "\n", "\n", "\n", "def get_file_page(link):\n", " response = urllib.request.urlopen(link)\n", " html = response.read().decode()\n", " lines = html.strip().split(\"\\n\")\n", " return lines\n", "\n", "\n", "\n", "def make_dataframe(lines):\n", " \n", " def find_lines_with(s,lines=lines):\n", " mylines = [s in l for l in lines]\n", " idx = [i for i, x in enumerate(mylines) if x]\n", " return idx\n", " \n", " vnid = find_lines_with('Variable Name:')\n", " sasid = find_lines_with('SAS Label:')\n", " etid = find_lines_with('English Text:')\n", " eiid = find_lines_with('English Instructions:')\n", " tid = find_lines_with('Target:')\n", " \n", " def fill_missing(thisid,vnid):\n", " vnb4 = [[vn')[1]\n", " strsplit2 = strsplit1.split('<') [0]\n",="" 
"="" strs[i]="strsplit2.replace('\\t','')\n"," "="" if="" '\\r'="" in="" strs[i]:\n",="" "="" extraline=""><')[0]\n", " extraline = extraline.replace('\\t','')\n", " extraline = extraline.replace('\\r','')\n", " strs[i] = strs[i].replace('\\r','')\n", " strs[i] += extraline\n", " return strs\n", " \n", " vnstr = pull_out_string(vnid,lines)\n", " sasstr = pull_out_string(sasid,lines)\n", " etstr = pull_out_string(etid,lines)\n", " eistr = pull_out_string(eiid,lines)\n", " tstr = pull_out_string(tid,lines)\n", " \n", " df = pd.dataframe({'variable name':vnstr , 'sas label':sasstr , 'english text':etstr , 'target':tstr , 'english instructions':eistr})\n", " return df\n", "\n", "\n", "\n", "def have_manners(wait_for_secs):\n", " time.sleep(wait_for_secs)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dr1tot_i.csv (1/2)\n", "dsqtot_i.csv (2/2)\n" ] } ], "source": [ "="" extraline="extraline.replace('\\t','')\n"," "="" extraline="extraline.replace('\\r','')\n"," "="" strs[i]="strs[i].replace('\\r','')\n"," "="" strs[i]="" +="extraline\n"," "="" return="" strs\n",="" "="" \n",="" "="" vnstr="pull_out_string(vnid,lines)\n"," "="" sasstr="pull_out_string(sasid,lines)\n"," "="" etstr="pull_out_string(etid,lines)\n"," "="" eistr="pull_out_string(eiid,lines)\n"," "="" tstr="pull_out_string(tid,lines)\n"," "="" \n",="" "="" df="pd.DataFrame({'Variable" name':vnstr="" ,="" 'sas="" label':sasstr="" ,="" 'english="" text':etstr="" ,="" 'target':tstr="" ,="" 'english="" instructions':eistr})\n",="" "="" return="" df\n",="" "\n",="" "\n",="" "\n",="" "def="" have_manners(wait_for_secs):\n",="" "="" time.sleep(wait_for_secs)"="" ]="" },="" {="" "cell_type":="" "code",="" "execution_count":="" 5,="" "metadata":="" {},="" "outputs":="" [="" {="" "name":="" "stdout",="" "output_type":="" "stream",="" "text":="" [="" "dr1tot_i.csv="" (1/2)\n",="" "dsqtot_i.csv="" (2/2)\n"="" ]="" }="" ],="" 
"source":="">
Nov 12, 2021
SOLUTION.PDF

Get Answer To This Question

Related Questions & Answers

More Questions »

Submit New Assignment

Copy and Paste Your Assignment Here