Understanding indicators used to measure health quality

Justetc Social Services (non-profit) · Jan 31 · 16 min read

This code works with the data in the Excel file indicator-methodology.xls.

Purpose:

  1. Identify the prominent indicators; these may also point to the critical aspects of health (see the sketch below)
  2. Identify the measurements that are used to assess quality

Code reference: this code heavily reuses the code provided in the Text Visualization Lab. Some of those methods are used as-is, i.e., as library functions.
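Before walking through the full notebook, here is a minimal sketch of goal 1 (my own preview, not part of the lab code; it assumes the same Excel file and the 'Indicator label' column used later, and that the NLTK stopwords are already downloaded): count how often each non-stopword term appears in the indicator labels, so that the most frequent terms suggest the prominent health-quality themes.

import re
from collections import Counter

import pandas as pd
from nltk.corpus import stopwords

df = pd.read_excel('../data/indicator-methodology.xls')
stops = set(stopwords.words('english'))

tokens = []
for label in df['Indicator label'].dropna():
    # keep letters only, lower-case, drop stop words
    words = re.sub('[^a-zA-Z]', ' ', str(label)).lower().split()
    tokens += [w for w in words if w not in stops]

# expected to surface terms such as 'mortality', 'cancer', 'adults' (see the token list later on)
print(Counter(tokens).most_common(10))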

# COMMENT IF PACKAGES ALREADY INSTALLED (if pip does not work use pip3)
# !pip install nltk
# !pip install wordcloud
# !pip install pytagcloud
# !pip install pygame
# !pip install simplejson
# !pip install bs4
# !pip install networkx
# !pip install gensim

import nltk
# nltk.download()  ## choose stopwords to download
from nltk.corpus import stopwords
from nltk.util import ngrams
from bs4 import BeautifulSoup
# import urllib2
import urllib.request as urllib2
import re
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
from pytagcloud import create_tag_image, make_tags, LAYOUT_MIX
import operator
from IPython.display import Image
import nltk.data
import networkx as nx
import sys
sys.setrecursionlimit(10000)
nltk.download('stopwords')
nltk.download('punkt')
# nltk.download() ## download stopwords and punkt

import networkx as nx
import random

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html


[nltk_data] Downloading package stopwords to C:\Users\Sayed
[nltk_data] Ahmed\AppData\Roaming\nltk_data...
[nltk_data] Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sayed
[nltk_data] Ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

# https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
def clean_text(raw_review):
    # Convert a raw text block into a cleaned string of words.
    # The input is a single string (raw HTML/text), and
    # the output is a single string (preprocessed text).
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review)
    # Remove javascript and style elements
    for script in review_text(["script", "style"]):
        script.extract()  # rip it out

    # Get the text
    review_text = review_text.get_text()
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    #
    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    #
    # 6. Join the words back into one string separated by spaces,
    #    and return the result.
    return " ".join(meaningful_words)
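A quick sanity check of clean_text() on one of the definition strings from the spreadsheet, wrapped in a hypothetical HTML tag just to show the HTML stripping (assumes the NLTK stopwords downloaded above):

sample = "<p>Number of deaths due to cancer per 100,000 females.</p>"
print(clean_text(sample))
# -> 'number deaths due cancer per females'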


def constructtree(lst, tree, factor, parent=None):
    # Recursively build a nested word tree from a list of words; also
    # records (parent, word) pairs in the global `edges` list.
    if lst == []:
        return {}
    else:
        word = lst[0]
        if parent:
            edges.append((parent, word))
        else:
            edges.append(('root', word))
        if word not in tree.keys():  # tree.has_key(word) for Python 2
            tree[word] = {'name': word, 'value': 1 / factor, "children": {}}
            tree[word]["children"] = constructtree(lst[1:], tree[word]["children"], factor, word)
        else:
            tree[word]["value"] += 1 / factor
            tree[word]["children"] = constructtree(lst[1:], tree[word]["children"], factor, word)
        return tree

def doall(wl, tree):
    # Build one tree from a list of word lists (one list per sentence).
    for x in wl:
        tree2 = constructtree(x, tree, 1)
        tree = tree2
    return tree, edges
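To see what these two helpers produce, here is a tiny run on two made-up word lists (note that constructtree() appends to a global edges list, so it must exist before the call; the notebook defines it further below):

edges = []  # global list that constructtree() fills with (parent, word) pairs
sample_lists = [['cancer', 'mortality'], ['cancer', 'survival']]
tree, edges = doall(sample_lists, {})
print(edges)
# [('root', 'cancer'), ('cancer', 'mortality'), ('root', 'cancer'), ('cancer', 'survival')]

The repeated ('root', 'cancer') pair is harmless: nx.Graph() keeps a single edge per node pair when the edges are added later.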



def hierarchy_pos(G, root=None, width=10., vert_gap=0.2, vert_loc=0, xcenter=0.5):
    '''
    From Joel's answer at https://stackoverflow.com/a/29597209/2966723

    If the graph is a tree this will return the positions to plot this in a
    hierarchical layout.

    G: the graph (must be a tree)

    root: the root node of the current branch
    - if the tree is directed and this is not given, the root will be found and used
    - if the tree is directed and this is given, then the positions will be just for the descendants of this node.
    - if the tree is undirected and not given, then a random choice will be used.

    width: horizontal space allocated for this branch - avoids overlap with other branches

    vert_gap: gap between levels of hierarchy

    vert_loc: vertical location of root

    xcenter: horizontal location of root
    '''
    if not nx.is_tree(G):
        raise TypeError('cannot use hierarchy_pos on a graph that is not a tree')

    if root is None:
        if isinstance(G, nx.DiGraph):
            root = next(iter(nx.topological_sort(G)))  # allows back compatibility with nx version 1.11
        else:
            root = random.choice(list(G.nodes))

    def _hierarchy_pos(G, root, width=1., vert_gap=0.2, vert_loc=0, xcenter=0.5, pos=None, parent=None):
        '''
        see hierarchy_pos docstring for most arguments

        pos: a dict saying where all nodes go if they have been assigned
        parent: parent of this branch - only affects it if non-directed
        '''
        if pos is None:
            pos = {root: (xcenter, vert_loc)}
        else:
            pos[root] = (xcenter, vert_loc)
        children = list(G.neighbors(root))
        if not isinstance(G, nx.DiGraph) and parent is not None:
            children.remove(parent)
        if len(children) != 0:
            dx = width / len(children)
            nextx = xcenter - width / 2 - dx / 2
            for child in children:
                nextx += dx
                pos = _hierarchy_pos(G, child, width=dx, vert_gap=vert_gap,
                                     vert_loc=vert_loc - vert_gap, xcenter=nextx,
                                     pos=pos, parent=root)
        return pos

    return _hierarchy_pos(G, root, width, vert_gap, vert_loc, xcenter)
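As a quick illustration (my own toy tree, unrelated to the indicator data), hierarchy_pos() can be exercised like this:

T = nx.Graph()
T.add_edges_from([('root', 'cancer'), ('root', 'wait'), ('cancer', 'mortality'), ('cancer', 'survival')])
pos = hierarchy_pos(T, 'root')  # root placed at the top, children spread out below
nx.draw(T, pos=pos, with_labels=True, node_color='white', edge_color='blue')
plt.show()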

Fetch indicator data, i.e., the indicator methodology

This gives an idea of the primary indicators used to measure healthcare quality.

import pandas as pd

indicator_methodology = pd.read_excel('../data/indicator-methodology.xls')
indicator_methodology.columns
indicator_methodology.head()
(image: output of indicator_methodology.head())
indicator_methodology['Indicator definition'][:5]

0    Number of deaths due to cancer per 100,000 fem...
1    Number of deaths due to cancer per 100,000 males.
2    Number of deaths due to ischemic heart disease...
3    Number of deaths due to cerebrovascular diseas...
4    Number of deaths due to transport accidents \n...
Name: Indicator definition, dtype: object

# get all indicators in a raw text variable
raw_indicator = ''
raw_measure = ''

for aRow in range(indicator_methodology.shape[0]):
    raw_indicator += ' ' + indicator_methodology['Indicator label'][aRow]
    raw_measure += ' ' + str(indicator_methodology['Indicator definition'][aRow])

#raw = pd.merge(indicator_methodology['Indicator label'], indicator_methodology['Indicator definition'])

raw_indicator[:100], raw_measure[:100]
raw_measure

' Number of deaths due to cancer per 100,000 females. Number of deaths due to cancer per 100,000 males. Number of deaths due to ischemic heart disease \nper 100,000 population. ... Total volume of antibiotics prescribed for systemic use, in defined daily doses per 1,000 population per day. Volume of second-line antibiotics as a percentage of all antibiotics prescribed. nan nan nan'

Clean data

text = clean_text(raw_indicator)

# I wrote similar code as part of NLP assignments
# remove punctuations
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w]+")
text = tokenizer.tokenize(text)

text


# get the list of stop words
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
print ('stop word count', len(stops) )

# just to show a partial list of stop words
stop_l = list(stops)
stop_l[:5]


# remove stopwords
text_list = []  # text.split(" ")
for aTok in text:
    if aTok not in stops:
        text_list.append(aTok)

text_list

stop word count 179





['cancer',
'mortality',
'f',
'cancer',
'mortality',
'heart',
'disease',
'mortality',
'stroke',
'mortality',
'transport',
'accident',
'mortality',
'f',
'transport',
'accident',
'mortality',
'suicide',
'f',
'suicide',
'infant',
'mortality',
'perceived',
'health',
'status',
'life',
'expectancy',
'birth',
'f',
'life',
'expectancy',
'birth',
'fruit',
'consumption',
'adults',
'vegetable',
'consumption',
'adults',
'smoking',
'adults',
'f',
'smoking',
'adults',
'alcohol',
'consumption',
'adults',
'obesity',
'reported',
'adults',
'wait',
'time',
'cataract',
'surgery',
'wait',
'time',
'hip',
'replacement',
'wait',
'time',
'knee',
'replacement',
'next',
'day',
'appt',
'poor',
'weekend',
'evening',
'care',
'wait',
'time',
'specialist',
'inability',
'pay',
'medical',
'bills',
'regular',
'doctor',
'influenza',
'vaccination',
'avoidable',
'admissions',
'copd',
'avoidable',
'admissions',
'asthma',
'avoidable',
'admissions',
'diabetes',
'time',
'spent',
'doctor',
'easy',
'understand',
'doctor',
'know',
'important',
'medical',
'history',
'involvement',
'decisions',
'breast',
'cancer',
'survival',
'breast',
'cancer',
'mortality',
'cervical',
'cancer',
'survival',
'cervical',
'cancer',
'mortality',
'colorectal',
'cancer',
'survival',
'colorectal',
'cancer',
'mortality',
'day',
'hospital',
'fatality',
'ami',
'day',
'hospital',
'fatality',
'ischemic',
'stroke',
'foreign',
'body',
'left',
'post',
'op',
'pe',
'hip',
'knee',
'post',
'op',
'sepsis',
'abdominal',
'ob',
'trauma',
'instrument',
'ob',
'trauma',
'instrument',
'diabetes',
'high',
'blood',
'pressure',
'medication',
'benzodiazepines',
'chronic',
'use',
'benzodiazepines',
'long',
'acting',
'antibiotics',
'total',
'volume',
'systemic',
'use',
'antibiotics',
'proportion',
'second',
'line',
'notes',
'oecd',
'organisation',
'economic',
'co',
'operation',
'development',
'province',
'determined',
'patient',
'residence',
'indicators',
'except',
'patient',
'safety',
'dimension',
'calculated',
'facility',
'province']

# find all unigrams


unigrams = ngrams(text_list, 1) # resulting object is an iterator
# bigrams = ngrams(text_list, 2) #
unigrams = list(ngrams(text_list, 1)) # resulting object is an iterator

#for uni in unigrams: #
#print(uni); #

freq = Counter(unigrams)
#print(freq) #

topN = freq.most_common()[1:20]  # items 2-20 by frequency (the single most frequent word is skipped)
#print(topN) #
wordscount = {w[0]:f for w, f in topN}
sorted_wordscount = sorted(wordscount.items(), key=operator.itemgetter(1),reverse=True)
#print(sorted_wordscount) #


## use pytag package
create_tag_image(make_tags(sorted_wordscount[:],maxsize=40), 'filename.png', size=(250,200), background=(0, 0, 0, 255), layout=LAYOUT_MIX, fontname='Molengo', rectangular=True)
Image("filename.png")
(image: tag cloud of the most frequent indicator terms)
text_again = ''
for aWord in text_list:
    if len(aWord) > 2:
        text_again += ' ' + aWord

text_again

' cancer mortality cancer mortality heart disease mortality stroke mortality transport accident mortality transport accident mortality suicide suicide infant mortality perceived health status ... notes oecd organisation economic operation development province determined patient residence indicators except patient safety dimension calculated facility province'

## using wordcloud package
wordcloud = WordCloud(max_font_size=40).generate(text_again)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## use custom scoring
wordscount = {w[0]:f for w, f in topN}

wordcloud = WordCloud(max_font_size=40)
wordcloud.fit_words(wordscount)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
(images: word cloud generated from the cleaned indicator text, and word cloud built from the custom word counts)

Word Association

# split text into sentences
# each sentence is a "market basket"
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#review_text = BeautifulSoup(raw_indicator)
review_text = BeautifulSoup(raw_indicator)
for script in review_text(["script", "style"]):
    script.extract()  # rip it out
# get text
review_text = review_text.get_text()
sentences = tokenizer.tokenize(review_text)


##naive implementation
word_association = Counter()
for sent in sentences:
    bigrams = Counter(ngrams([w for w in sent.lower().split(" ") if not w in stopwords.words('english')], 2))
    word_association.update(bigrams)

topN = word_association.most_common()[1:20]

G = nx.Graph()
for edge in topN:
    G.add_edge(edge[0][0], edge[0][1], weight=edge[1])
pos=nx.circular_layout(G)

plt.figure(3,figsize=(6,6))
nx.draw(G, pos, with_labels = True, font_size=10, edge_color='blue', node_color='white', font_weight='bold')
plt.show()
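To make the "market basket" counting above concrete, here is a tiny self-contained illustration on two made-up sentences (hypothetical data, not from the Excel file):

from collections import Counter
from nltk.util import ngrams

toy_sentences = ['cancer mortality females', 'cancer mortality males']
assoc = Counter()
for s in toy_sentences:
    assoc.update(Counter(ngrams(s.split(), 2)))
print(assoc.most_common(1))
# [(('cancer', 'mortality'), 2)] -- the pair that co-occurs in both sentences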

# http://stackoverflow.com/questions/13429094/implementing-a-word-tree-using-nested-dictionaries-in-python
edges = []



word_sents_arrays = []
for sent in sentences[1:5]:
    word_sents_arrays.append(clean_text(sent).split())
print(sentences[1:5])


mnn, edges= doall(word_sents_arrays,{})




G=nx.Graph()
G.add_edges_from(edges)
G1 = nx.Graph(nx.minimum_spanning_edges(G))
pos = hierarchy_pos(G1,'root')

plt.figure(3,figsize=(10,10))
nx.draw(G1, pos=pos, with_labels=True, edge_color='blue', node_size=30, font_size=10, node_color='white')
plt.show()
(image: word association graph of the indicator labels)
['Province is determined by patient residence for all indicators except those in the patient safety dimension, which are calculated by facility province.']
(image: word tree built from the indicator sentences)

Understanding what is measured

text = clean_text(raw_measure)

# I wrote similar code as part of NLP assignments
# remove punctuations
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w]+")
text = tokenizer.tokenize(text)

text


# get the list of stop words
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
print ('stop word count', len(stops) )

# just to show a partial list of stop words
stop_l = list(stops)
stop_l[:5]


# remove stopwords
text_list = []  # text.split(" ")
for aTok in text:
    if aTok not in stops:
        text_list.append(aTok)

text_list

stop word count 179





['number',
'deaths',
'due',
'cancer',
'per',
'females',
'number',
'deaths',
'due',
'cancer',
'per',
'males',
'number',
'deaths',
'due',
'ischemic',
'heart',
'disease',
'per',
'population',
'number',
'deaths',
'due',
'ischemic',
'heart',
'disease',
'per',
'population',
'number',
'deaths',
'due',
'ischemic',
'heart',
'disease',
'per',
'population',
'number',
'deaths',
'due',
'ischemic',
'heart',
'disease',
'per',
'population',
'number',
'deaths',
'due',
'cerebrovascular',
'diseases',
'per',
'population',
'number',
'deaths',
'due',
'cerebrovascular',
'diseases',
'per',
'population',
'number',
'deaths',
'due',
'cerebrovascular',
'diseases',
'per',
'population',
'number',
'deaths',
'due',
'cerebrovascular',
'diseases',
'per',
'population',
'number',
'deaths',
'due',
'transport',
'accidents',
'per',
'females',
'number',
'deaths',
'due',
'transport',
'accidents',
'per',
'females',
'number',
'deaths',
'due',
'transport',
'accidents',
'per',
'females',
'number',
'deaths',
'due',
'transport',
'accidents',
'per',
'females',
'number',
'deaths',
'due',
'transport',
'accidents',
'per',
'males',
'number',
'deaths',
'due',
'transport',
'accidents',
'per',
'males',
'number',
'deaths',
'due',
'transport',
'accidents',
'per',
'males',
'number',
'deaths',
'due',
'transport',
'accidents',
'per',
'males',
'number',
'deaths',
'due',
'suicide',
'per',
'females',
'number',
'deaths',
'due',
'suicide',
'per',
'males',
'deaths',
'children',
'younger',
'year',
'per',
'live',
'births',
'percentage',
'population',
'age',
'report',
'health',
'good',
'better',
'average',
'number',
'years',
'female',
'expected',
'live',
'assuming',
'age',
'specific',
'mortality',
'levels',
'remain',
'constant',
'average',
'number',
'years',
'male',
'expected',
'live',
'assuming',
'age',
'specific',
'mortality',
'levels',
'remain',
'constant',
'percentage',
'population',
'age',
'report',
'eating',
'fruit',
'least',
'per',
'day',
'percentage',
'population',
'age',
'report',
'eating',
'vegetables',
'least',
'per',
'day',
'percentage',
'female',
'population',
'age',
'report',
'daily',
'smokers',
'percentage',
'male',
'population',
'age',
'report',
'daily',
'smokers',
'average',
'annual',
'alcohol',
'consumption',
'litres',
'per',
'capita',
'age',
'average',
'annual',
'alcohol',
'consumption',
'litres',
'per',
'capita',
'age',
'average',
'annual',
'alcohol',
'consumption',
'litres',
'per',
'capita',
'age',
'percentage',
'adults',
'obese',
'body',
'mass',
'index',
'higher',
'kg',
'self',
'report',
'median',
'wait',
'time',
'days',
'specialist',
'assessment',
'booking',
'date',
'cataract',
'surgery',
'median',
'wait',
'time',
'days',
'specialist',
'assessment',
'booking',
'date',
'cataract',
'surgery',
'median',
'wait',
'time',
'days',
'specialist',
'assessment',
'booking',
'date',
'cataract',
'surgery',
'median',
'wait',
'time',
'days',
'specialist',
'assessment',
'booking',
'date',
'hip',
'replacement',
'median',
'wait',
'time',
'days',
'specialist',
'assessment',
'booking',
'date',
'hip',
'replacement',
'median',
'wait',
'time',
'days',
'specialist',
'assessment',
'booking',
'date',
'hip',
'replacement',
'median',
'wait',
'time',
'days',
'specialist',
'assessment',
'booking',
'date',
'knee',
'replacement',
'median',
'wait',
'time',
'days',
'specialist',
'assessment',
'booking',
'date',
'knee',
'replacement',
'median',
'wait',
'time',
'days',
'specialist',
'assessment',
'booking',
'date',
'knee',
'replacement',
'percentage',
'people',
'able',
'get',
'appointment',
'see',
'doctor',
'nurse',
'next',
'day',
'last',
'time',
'sick',
'needed',
'medical',
'attention',
'percentage',
'people',
'needed',
'care',
'hours',
'reported',
'difficulty',
'getting',
'medical',
'care',
'evenings',
'weekends',
'holidays',
'without',
'going',
'hospital',
'emergency',
'department',
'emergency',
'room',
'percentage',
'adults',
'waited',
'weeks',
'advised',
'see',
'decided',
'see',
'specialist',
'percentage',
'people',
'medical',
'problem',
'consult',
'visit',
'doctor',
'cost',
'percentage',
'people',
'regular',
'doctor',
'place',
'care',
'percentage',
'adults',
'age',
'received',
'influenza',
'vaccination',
'within',
'past',
'year',
'number',
'hospital',
'discharges',
'copd',
'people',
'age',
'older',
'per',
'population',
'number',
'hospital',
'discharges',
'asthma',
'people',
'age',
'older',
'per',
'population',
'number',
'hospital',
'discharges',
'diabetes',
'people',
'age',
'older',
'per',
'population',
'percentage',
'adults',
'report',
'regular',
'doctor',
'always',
'often',
'spent',
'enough',
'time',
'percentage',
'adults',
'report',
'regular',
'doctor',
'always',
'often',
'explains',
'things',
'way',
'easy',
'understand',
'percentage',
'older',
'adults',
'age',
'report',
'regular',
'doctor',
'always',
'often',
'gave',
'opportunity',
'ask',
'questions',
'raise',
'concerns',
'percentage',
'adults',
'report',
'regular',
'doctor',
'always',
'often',
'involved',
'much',
'wanted',
'decisions',
'care',
'treatment',
'year',
'relative',
'survival',
'rate',
'breast',
'cancer',
'number',
'deaths',
'due',
'breast',
'cancer',
'per',
'females',
'year',
'relative',
'survival',
'rate',
'cervical',
'cancer',
'number',
'deaths',
'due',
'cervical',
'cancer',
'per',
'females',
'year',
'relative',
'survival',
'rate',
'colorectal',
'cancer',
'number',
'deaths',
'due',
'colorectal',
'cancer',
'per',
'population',
'number',
'deaths',
'due',
'colorectal',
'cancer',
'per',
'population',
'percentage',
'patients',
'age',
'die',
'hospital',
'within',
'days',
'admitted',
'primary',
'diagnosis',
'acute',
'myocardial',
'infarction',
'ami',
'percentage',
'patients',
'age',
'die',
'hospital',
'within',
'days',
'admitted',
'primary',
'diagnosis',
'ischemic',
'stroke',
'percentage',
'patients',
'age',
'die',
'hospital',
'within',
'days',
'admitted',
'primary',
'diagnosis',
'ischemic',
'stroke',
'rate',
'foreign',
'body',
'left',
'inside',
'patient',
'body',
'procedure',
'per',
'hospital',
'discharges',
'age',
'rate',
'post',
'operative',
'pulmonary',
'embolism',
'per',
'discharges',
'hip',
'knee',
'replacement',
'age',
'rate',
'post',
'operative',
'sepsis',
'per',
'discharges',
'abdominal',
'surgery',
'age',
'percentage',
'vaginal',
'deliveries',
'third',
'fourth',
'degree',
'obstetric',
'trauma',
'per',
'instrument',
'assisted',
'vaginal',
'deliveries',
'percentage',
'vaginal',
'deliveries',
'third',
'fourth',
'degree',
'obstetric',
'trauma',
'per',
'vaginal',
'deliveries',
'without',
'instrument',
'assistance',
'percentage',
'patients',
'diabetes',
'prescription',
'first',
'choice',
'antihypertensive',
'medication',
'number',
'per',
'patients',
'age',
'prescriptions',
'daily',
'doses',
'benzodiazepines',
'related',
'drugs',
'number',
'per',
'patients',
'age',
'least',
'one',
'prescription',
'long',
'acting',
'benzodiazepines',
'related',
'drugs',
'total',
'volume',
'antibiotics',
'prescribed',
'systemic',
'use',
'defined',
'daily',
'doses',
'per',
'population',
'per',
'day',
'volume',
'second',
'line',
'antibiotics',
'percentage',
'antibiotics',
'prescribed',
'nan',
'nan',
'nan']

# text = clean_text(raw_measure)
# text_list = text.split(" ")


# print(text_list) #
unigrams = ngrams(text_list, 1) # resulting object is an iterator
# bigrams = ngrams(text_list, 2) #
unigrams = list(ngrams(text_list, 1)) # resulting object is an iterator

#for uni in unigrams: #
#print(uni); #

freq = Counter(unigrams)
#print(freq) #

topN = freq.most_common()[1:20]  # items 2-20 by frequency (the single most frequent word is skipped)
#print(topN) #
wordscount = {w[0]:f for w, f in topN}
sorted_wordscount = sorted(wordscount.items(), key=operator.itemgetter(1),reverse=True)
#print(sorted_wordscount) #


## use pytag package
create_tag_image(make_tags(sorted_wordscount[:],maxsize=40), 'filename.png', size=(250,200), background=(0, 0, 0, 255), layout=LAYOUT_MIX, fontname='Molengo', rectangular=True)
Image("filename.png")
(image: tag cloud of the most frequent measurement terms)
text_again = ''
for aWord in text_list:
    if len(aWord) > 2:
        text_again += ' ' + aWord

text_again

## using wordcloud package
wordcloud = WordCloud(max_font_size=40).generate(text_again)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## use custom scoring
wordscount = {w[0]:f for w, f in topN}

wordcloud = WordCloud(max_font_size=40)
wordcloud.fit_words(wordscount)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
(images: word cloud generated from the cleaned measurement text, and word cloud built from the custom word counts)
# split text into sentences
# each sentence is a "market basket"
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
review_text = BeautifulSoup(raw_measure)
for script in review_text(["script", "style"]):
    script.extract()  # rip it out
# get text
review_text = review_text.get_text()
sentences = tokenizer.tokenize(review_text)


##naive implementation
word_association = Counter()
for sent in sentences:
    bigrams = Counter(ngrams([w for w in sent.lower().split(" ") if not w in stopwords.words('english')], 2))
    word_association.update(bigrams)

topN = word_association.most_common()[1:20]

G = nx.Graph()
for edge in topN:
    G.add_edge(edge[0][0], edge[0][1], weight=edge[1])
pos=nx.circular_layout(G)

plt.figure(3,figsize=(6,6))
nx.draw(G, pos, with_labels = True, font_size=10, edge_color='blue', node_color='white', font_weight='bold')
plt.show()

# http://stackoverflow.com/questions/13429094/implementing-a-word-tree-using-nested-dictionaries-in-python
edges = []

word_sents_arrays = []
for sent in sentences[1:5]:
    word_sents_arrays.append(clean_text(sent).split())
print(sentences[1:5])


mnn, edges= doall(word_sents_arrays,{})




G=nx.Graph()
G.add_edges_from(edges)
G1 = nx.Graph(nx.minimum_spanning_edges(G))
pos = hierarchy_pos(G1,'root')

plt.figure(3,figsize=(10,10))
nx.draw(G1, pos=pos, with_labels=True, edge_color='blue', node_size=30, font_size=10, node_color='white')
plt.show()
(image: word association graph of the measurement definitions)
['Number of deaths due to cancer per 100,000 males.', 'Number of deaths due to ischemic heart disease \nper 100,000 population.Number of deaths due to ischemic heart disease \nper 100,000 population.Number of deaths due to ischemic heart disease \nper 100,000 population.Number of deaths due to ischemic heart disease \nper 100,000 population.', 'Number of deaths due to cerebrovascular diseases \nper 100,000 population.Number of deaths due to cerebrovascular diseases \nper 100,000 population.Number of deaths due to cerebrovascular diseases \nper 100,000 population.Number of deaths due to cerebrovascular diseases \nper 100,000 population.', 'Number of deaths due to transport accidents \nper 100,000 females.Number of deaths due to transport accidents \nper 100,000 females.Number of deaths due to transport accidents \nper 100,000 females.Number of deaths due to transport accidents \nper 100,000 females.']
(image: word tree built from the measurement sentences)

Reference:

  1. pd.merge reference: https://www.shanelynn.ie/merge-join-dataframes-python-pandas-index-1/ (in the end I simply read the columns and concatenated the strings in a loop; a pandas alternative is sketched below)
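For completeness, the pandas alternative hinted at above could look roughly like this (a sketch assuming the same file path and column names; it is equivalent to the concatenation loop used earlier):

import pandas as pd

df = pd.read_excel('../data/indicator-methodology.xls')
# join each column into one long string, mirroring raw_indicator / raw_measure above
raw_indicator = ' '.join(df['Indicator label'].astype(str))
raw_measure = ' '.join(df['Indicator definition'].astype(str))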