<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/rss2full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.propublica.org/~d/styles/itemcontent.css"?><rss xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" version="2.0">

    <channel>
    
    <title>The ProPublica Nerd Blog</title>
    <link>http://www.propublica.org/nerds/</link>
    <description />
    <dc:language />
    <dc:creator>ProPublica</dc:creator>
    <dc:rights>Copyright 2013</dc:rights>
    <dc:date>2013-05-09T15:28:57-05:00</dc:date>
    <admin:generatorAgent rdf:resource="http://expressionengine.com/" />

    

	<atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/rss+xml" href="http://feeds.propublica.org/propublica/nerds" /><feedburner:info uri="propublica/nerds" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><item>
		<title>Resources for Investigating Tax-Exempt Organizations</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/RiNw3dwGCFs/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/resources-for-investigating-tax-exempt-organizations/#25743</guid>
		<description>&lt;p&gt;In addition to our &lt;a href="http://projects.propublica.org/nonprofits"&gt;Nonprofit Explorer&lt;/a&gt; interactive database, here are some resources for researching charities and other tax-exempt organizations.&lt;/p&gt;

&lt;h2&gt;Getting Started&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;The Reynolds Center for Business Journalism has an &amp;#8220;&lt;a href="http://businessjournalism.org/wp-content/uploads/2012/01/Investigating-Nonprofits-Campbell.pdf"&gt;Investigating Nonprofits Tipsheet&lt;/a&gt;&amp;#8221; as well as &lt;a href="http://businessjournalism.org/2012/01/31/investigating-private-companies-and-nonprofits-self-guided-training/"&gt;self-guided training on investigating nonprofits&lt;/a&gt; (as well as other private companies)&lt;/li&gt;
&lt;li&gt;The Foundation Center&amp;#8217;s &lt;a href="http://foundationcenter.org/findfunders/990finder/"&gt;990 Finder&lt;/a&gt; has information on U.S. charitable foundations, including financial reports.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.guidestar.org/"&gt;Guidestar Premium&lt;/a&gt; has sophisticated tools that let you compare nonprofits by financial health, number of full-time employees, board chairs, etc. &lt;/li&gt;
&lt;li&gt;The &lt;a href="http://www.nasconet.org/documents/u-s-charity-offices/"&gt;National Association of State Charity Officials&lt;/a&gt; links to state offices that regulate charitable organizations and charitable solicitations. &lt;/li&gt;
&lt;li&gt;You can request 990s or applications for exemptions directly from the &lt;a href="http://www.irs.gov/charities/article/0,,id=135033,00.html"&gt;Internal Revenue Service&lt;/a&gt;. The IRS website also allows for searching of political organization disclosures and has 990-n postcard searching. &lt;/li&gt;
&lt;li&gt;Some states, such as &lt;a href="http://www.secretary.state.nc.us/csl/Search.aspx"&gt;North Carolina&lt;/a&gt;, require nonprofits to file their annual audits online. &lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;Diving Deeper&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;The &lt;a href="http://www-lib.iupui.edu/special/fc"&gt;Indiana University archives&lt;/a&gt; retain some historical non-profit filings, and you can submit search requests. &lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.npccny.org/new990/new990.htm"&gt;NCC of New York and Ford Foundation&lt;/a&gt; have a very in-depth guide on how to read a 990 to analyze their net assets, look for self-dealing, find out if any officers have left, etc. &lt;/li&gt;
&lt;li&gt;The law that covers tax-exempt organizations is &lt;a href="http://www.gpo.gov/fdsys/granule/USCODE-2011-title26/USCODE-2011-title26-subtitleA-chap1-subchapF-partI-sec501/content-detail.html"&gt;26 USC &amp;#167; 501 Exemption From Tax on Corporations, Certain Trusts, etc.&lt;/a&gt; &lt;/li&gt;
&lt;li&gt;The IRS publishes many useful guides for tax-exempt filers, which are helpful for those trying to understand how nonprofits work, including: &lt;a href="http://www.irs.gov/pub/irs-pdf/i990.pdf"&gt;Instructions for Form 990
Return of Organization Exempt From Income Tax&lt;/a&gt; and &lt;a href="http://www.irs.gov/pub/irs-pdf/p557.pdf"&gt;Publication 557: Tax-Exempt Status for Your Organization&lt;/a&gt; &lt;/li&gt;
&lt;li&gt;The Chronicle of Philanthropy publishes two particularly useful resources: &lt;a href="http://philanthropy.com/americagives"&gt;How America Gives&lt;/a&gt;, a database that looks at giving patterns in every city, state and neighborhood in the U.S., and &lt;a href="http://philanthropy.com/stats/topdonors/"&gt;America's Top Donors&lt;/a&gt;,  a database of gifts of $1 million or more to charities since 2003. &lt;/li&gt;
&lt;/ul&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/RiNw3dwGCFs" height="1" width="1"/&gt;</description>
		<dc:author>Scott Klein</dc:author>
		<dc:subject />
		<dc:date>2013-05-09T15:28:57-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/resources-for-investigating-tax-exempt-organizations/</feedburner:origLink></item>

	<item>
		<title>A Super-Simple Tool to Search Instagram by Time and Location</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/FC9ohBNpoQs/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/a-super-simple-tool-to-search-instagram-by-time-and-location/#25733</guid>
		<description>&lt;p&gt;This week, Justin Elliott wrote about new House Financial Services Committee chairman Rep. Jeb Hensarling (R-Texas) attending a &lt;a href="http://www.propublica.org/article/house-finance-chair-goes-on-ski-vacation-with-wall-street"&gt;weekend getaway with banking industry officials&lt;/a&gt;. &lt;/p&gt;
&lt;p&gt;One of the ways he found out who was at the getaway was by using the &lt;a href="http://www.instagram.com/"&gt;Instagram&lt;/a&gt; photo sharing service, which turned up a snowy snapshot taken by Len Wolfson, a lobbyist for the Mortgage Bankers Association (which had contributed to Hensarling's PAC). Wolfson has since set his account to private.&lt;/p&gt;
&lt;p&gt;The Instagram site has no search function, so finding shots like this can take a lot of digging. However, Instagram has an API with a &lt;a href="http://instagram.com/developer/endpoints/media/#get_media_search"&gt;"Media Search" endpoint&lt;/a&gt; that returns data both by timeframe and distance from a certain latitude and longitude -- a perfect way to see who's at a &lt;a href="http://politicalpartytime.org/"&gt;certain place at a certain time&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;We wrote a simple &lt;a href="http://sinatrarb.com"&gt;Sinatra&lt;/a&gt; app that uses this endpoint which we're calling &lt;a href="https://github.com/propublica/qis"&gt;QIS (Quick Instagram Search)&lt;/a&gt;. The API is pretty limited, so our tool is too. There's no way to search for text or hashtags (tags have their own endpoint which doesn't allow geolocation), there's no pagination of results, and results only go back a few months. We're using Google's Geocoder to take any place, landmark or address string and turn it into a latitude and longitude for Instagram. We're open-sourcing QIS today.&lt;/p&gt;
&lt;p&gt;Here's an example of a search for yesterday between Noon and 7 p.m. at the New Orleans Fair Grounds, where the New Orleans Jazz and Heritage Festival is currently going on:&lt;/p&gt;
&lt;p&gt;&lt;img alt="Jazzfest" src="http://propublica.s3.amazonaws.com/assets/nerds/quick-instagram-search-jazzfest.png" /&gt;&lt;/p&gt;
&lt;p&gt;Just playing around with QIS for a few minutes, we &lt;a href="http://instagram.com/p/YIsWXKoCV8/"&gt;found a shot&lt;/a&gt; of the finish line at the Boston Marathon, 8 minutes before the first bomb exploded, and &lt;a href="http://instagram.com/p/YokaFZlc_z/"&gt;a shot of Newt Gingrich&lt;/a&gt; outside the White House Correspondents' Dinner.&lt;/p&gt;
&lt;p&gt;To bootstrap QIS, see the README file in the &lt;a href="https://github.com/propublica/qis"&gt;repository&lt;/a&gt;.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/FC9ohBNpoQs" height="1" width="1"/&gt;</description>
		<dc:author>Al Shaw</dc:author>
		<dc:subject />
		<dc:date>2013-05-03T15:04:44-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/a-super-simple-tool-to-search-instagram-by-time-and-location/</feedburner:origLink></item>

	<item>
		<title>Tracking State Bills Takes a Village: The Legislature Tracker Gets Launched in N.Y.</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/nTuyy5Fw9zw/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/tracking-state-bills-takes-a-village/#25732</guid>
		<description>&lt;p&gt;I'm an interactive news developer at &lt;a href="http://www.minnpost.com"&gt;MinnPost&lt;/a&gt; in Minneapolis. I've been working in the Nerd Cube at ProPublica this week as a &lt;a href="http://www.propublica.org/about/p5-project"&gt;P5 Resident&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;I spent the week working with other developers enhancing MinnPost&amp;#8217;s &lt;a href="http://www.minnpost.com/data/2013/04/minnesota-legislative-bill-tracker"&gt;Legislature Tracker&lt;/a&gt; so that any newsroom can easily use it to report on the work of their state legislature. &lt;/p&gt;

&lt;p&gt;The Legislature Tracker is an application that provides a curation layer on top of the vast amount of legislative data that is produced each session in all the 50 states around the country. It enables newsrooms, organizations or individuals to easily identify and keep track of bills that they or their audiences will find important.&lt;/p&gt;

&lt;p&gt;The goal of the week was to make the tool work for states other than Minnesota. That required some coding help but also needed expertise in a statehouse other than my own. So in addition to ProPublica's help we were really happy to turn to the energetic team at &lt;a href="http://www.thenewyorkworld.com"&gt;The New York World&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;After a week-long sprint, we accomplished two major things: We've made the Tracker easily customizable to work with any legislature; and the New York World is launching a &lt;a href="http://thenewyorkworld.com/public/2013/05/legislature-tracker/index.php"&gt;New York Legislation Tracker&lt;/a&gt; which highlights what their editors have decided are the important bills that are being debated in Albany.&lt;/p&gt;

&lt;p&gt;The MinnPost data team, Tom Nehil, Kaeti Hinck, and I, worked hard on developing the Legislature Tracker at the beginning of the year, and we were very excited to launch it last month. We saw early on that this sort of application could be very useful for other states and other groups, and we made many decisions to ensure that the application could be easily re-used and deployed. But due to deadlines and limited resources, we were not quite able to fully reach that goal. Fortunately I had the honor to come be a part of the P5 program here at ProPublica and knew right away it would be a great opportunity to reach the goal of finishing what we started with Legislature Tracker.&lt;/p&gt;

&lt;p&gt;At the beginning of the week Scott Klein, Jeremy Merrill, and I were talking about what we could accomplish this week, and we realized that the easiest way to make sure we were making the Legislature Tracker widely applicable was to make one for another state. This would make it obvious if our assumptions were correct and if our code worked like we thought it did. Scott immediately thought of The New York World and called editor Alyssa Katz, who jumped at the chance. The next morning two terrific NY World staffers -- Beth Morrissey and Michael Sullivan -- were hacking with us in the nerd cube. Beth is a reporting fellow with knowledge about the New York State Legislature and had contacts to dig in deep on the subtleties of the state legislative process, as well as to identify the bills worth tracking. Mike, who is the New York World&amp;#8217;s tech-savvy deputy editor, hopped on to help code on the Tracker and deploy it for New York World.&lt;/p&gt;

&lt;p&gt;With this hard-working ad-hoc team, we sprinted all week to add features, fix bugs, identify bills, code in legislative process, and ultimately launch the Legislature Tracker on the New York World site.&lt;/p&gt;

&lt;p&gt;I came into the P5 program very excited and ready to make the Tracker project much more useful to everyone, but my expectations have been blown away and I am very proud of what we were able to accomplish this week. Many thanks to MinnPost, ProPublica, and New York World for all their help and resources.  If you want to launch the Legislature Tracker in your state, &lt;a href="mailto:apalazzolo@minnpost.com"&gt;shoot me an email&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The Legislature Tracker is all open source. &lt;a href="http://minnpost.github.io/legislature-tracker/"&gt;Examples and documentation&lt;/a&gt; are over on Github.  There's still plenty to do to make it better, so any help is much appreciated.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/nTuyy5Fw9zw" height="1" width="1"/&gt;</description>
		<dc:author>ProPublica</dc:author>
		<dc:subject />
		<dc:date>2013-05-03T14:00:30-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/tracking-state-bills-takes-a-village/</feedburner:origLink></item>

	<item>
		<title>Latest P5 Resident Working on Open Source Legislation Tracker</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/wEU8E4cTJ54/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/latest-p5-resident-working-on-open-source-legislation-tracker/#25720</guid>
		<description>&lt;p&gt;Our P5 Resident this month is Alan Palazzolo from MinnPost. He&amp;#8217;s the fifth P5 Resident.&lt;/p&gt;

&lt;p&gt;Alan is an interactive news developer at MinnPost, where he builds open source &lt;a href="http://www.minnpost.com/data/" title="http://www.minnpost.com/data/"&gt;news applications and data visualizations&lt;/a&gt;. His projects have included a live &lt;a href="http://www.minnpost.com/results#dashboard" title="http://www.minnpost.com/results#dashboard"&gt;2012 Minnesota election results dashboard&lt;/a&gt; and an animated map of a day in the life of &lt;a href="http://www.minnpost.com/data/2012/06/day-life-nice-ride-bikes" title="http://www.minnpost.com/data/2012/06/day-life-nice-ride-bikes"&gt;Twin Cities &amp;#8220;Nice Ride&amp;#8221; bike share&lt;/a&gt;. Before joining MinnPost, he was part of the inaugural 2011 Fellow class at Code for America, working in Seattle, Wash. While at Code for America he helped build &lt;a href="http://changeby.us/" title="http://changeby.us/"&gt;Change by Us&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;While he's here in the nerd cube, he&amp;#8217;ll be refactoring his code for MinnPost&amp;#8217;s Minnesota &lt;a href="http://www.minnpost.com/data/2013/04/minnesota-legislative-bill-tracker#categories" title="http://www.minnpost.com/data/2013/04/minnesota-legislative-bill-tracker#categories"&gt;Legislative Bill Tracker&lt;/a&gt; so it can work for any state. It&amp;#8217;s already &lt;a href="https://github.com/MinnPost/legislature-tracker" title="https://github.com/MinnPost/legislature-tracker"&gt;available on GitHub&lt;/a&gt; but we&amp;#8217;re hoping by the end of the week to release a new version that will let anybody easily alter it to work with their state's legislative data. The tracker, which uses OpenStates data, lets people focus on the bills that really matter to a publication or community.&lt;/p&gt;

&lt;p&gt;We&amp;#8217;re always on the lookout for new P5 Residents. Here are some &lt;a href="http://www.propublica.org/about/p5-project" title="http://www.propublica.org/about/p5-project"&gt;details on the program and how to apply&lt;/a&gt;.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/wEU8E4cTJ54" height="1" width="1"/&gt;</description>
		<dc:author>Scott Klein</dc:author>
		<dc:subject />
		<dc:date>2013-04-29T13:29:59-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/latest-p5-resident-working-on-open-source-legislation-tracker/</feedburner:origLink></item>

	<item>
		<title>ProPublica Projects—and P5 Projects—Finalists for Data Journalism Awards</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/jUF805PWLtc/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/propublica-projects-and-p5-projects-finalists-for-data-journalism-awards/#25719</guid>
		<description>&lt;p&gt;Over the weekend, the &lt;a href="http://www.globaleditorsnetwork.org/" title="http://www.globaleditorsnetwork.org/"&gt;Global Editors Network&lt;/a&gt; announced finalists for its &lt;a href="http://www.globaleditorsnetwork.org/dja/" title="http://www.globaleditorsnetwork.org/dja/"&gt;2013 Data Journalism Awards&lt;/a&gt;. &lt;/p&gt;

&lt;p&gt;On the shortlist are three projects from ProPublica: &lt;/p&gt;

&lt;ul&gt;&lt;li&gt;Maps &lt;a href="http://projects.propublica.org/graphics/westchester" title="http://projects.propublica.org/graphics/westchester"&gt;that retell&lt;/a&gt; the &amp;#8220;&lt;a href="http://projects.propublica.org/graphics/city-maps" title="http://projects.propublica.org/graphics/city-maps"&gt;Great Migration&lt;/a&gt;&amp;#8221; that took place in the mid-20th century, which saw African Americans migrate from the rural south to northern cities only to find themselves facing housing segregation in their new hometowns. The project, by Jeff Larson and Nikole Hannah-Jones, accompanied Hannah-Jones&amp;#8217;s &lt;a href="http://www.propublica.org/series/living-apart" title="http://www.propublica.org/series/living-apart"&gt;investigative project&lt;/a&gt; about the historical failure of the U.S. Department of Housing and Urban Development to take steps to end housing segregation.
&lt;/li&gt;&lt;/ul&gt;
&lt;ul&gt;&lt;li&gt;&lt;a href="http://projects.propublica.org/emails" title="http://projects.propublica.org/emails"&gt;Message Machine&lt;/a&gt;, a project to reverse engineer political messaging in the 2012 campaign. News app developer Jeff Larson developed a system that processed, in real-time, political emails sent in by readers, and tried to uncover the factors behind the sometimes subtle differences between the &amp;#8220;micro-targeted&amp;#8221; messages. With the help of more than 600 readers, we were able to uncover the versions of each email, and the demographic factors the campaigns used to tailor their messages while the campaign was still going on. To our knowledge this was the first time these techniques were used in a news application.
&lt;/li&gt;&lt;/ul&gt;
&lt;ul&gt;&lt;li&gt;The &lt;a href="http://www.propublica.org/nerds" title="http://www.propublica.org/nerds"&gt;ProPublica Nerd Blog&lt;/a&gt;, where we publish all of the methodologies, recipes and the secrets behind our news application development. It&amp;#8217;s always been an important component of ProPublica&amp;#8217;s mission to share our work. Our journalism is available to republish under a Creative Commons license, and has been so since we launched in  2008. The News Apps desk at ProPublica makes lots of our code available via GitHub. We also, after each big project launches, explain our techniques in nerdy detail via long technical posts. You can find all of this in the nerd blog.
&lt;/li&gt;&lt;/ul&gt;
&lt;p&gt;Also on the shortlist is an interactive project in the Washington Post by Sisi Wei, who now works at ProPublica, called &amp;#8220;&lt;a href="http://www.washingtonpost.com/wp-srv/special/politics/2012-exit-polls/" title="http://www.washingtonpost.com/wp-srv/special/politics/2012-exit-polls/"&gt;Exit Polls 2012: How the Vote Has Shifted&lt;/a&gt;.&amp;#8221; &lt;/p&gt;

&lt;p&gt;We&amp;#8217;re very pleased to be nominated alongside the amazing work of so many of our colleagues around the world. We especially wanted to point out two projects done by Residents from &lt;a href="http://www.propublica.org/nerds/item/p5"&gt;our P5 program&lt;/a&gt; who were also nominated: La Naci&amp;#243;n (Argentina) is nominated for their &lt;a href="http://blogs.lanacion.com.ar/ddj/data-driven-investigative-journalism/argentina-senate-expenses/" title="http://blogs.lanacion.com.ar/ddj/data-driven-investigative-journalism/argentina-senate-expenses/"&gt;remarkable work on Senate expenses&lt;/a&gt;, and Berliner Morgenpost is nominated for their &amp;#8220;&lt;a href="http://flugroutenradar.morgenpost.de" title="http://flugroutenradar.morgenpost.de"&gt;Flugrouten-Radar&lt;/a&gt;&amp;#8221; project, which got its start as Julius Troeger&amp;#8217;s P5 project while he sat in the ProPublica nerd cube here in New York.&lt;/p&gt;

&lt;p&gt;The winners of the 2nd annual Data Journalism Awards will be announced in Paris on &lt;a href="http://www.globaleditorsnetwork.org/gen2013/" title="http://www.globaleditorsnetwork.org/gen2013/"&gt;June 19&lt;/a&gt;. A &lt;a href="http://www.globaleditorsnetwork.org/wp-content/uploads/DJA%20PR%20Perugia%2027%20April.pdf" title="http://www.globaleditorsnetwork.org/wp-content/uploads/DJA%20PR%20Perugia%2027%20April.pdf"&gt;full list of nominees&lt;/a&gt; is available on the GEN website. &lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/jUF805PWLtc" height="1" width="1"/&gt;</description>
		<dc:author>Scott Klein</dc:author>
		<dc:subject />
		<dc:date>2013-04-29T13:25:38-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/propublica-projects-and-p5-projects-finalists-for-data-journalism-awards/</feedburner:origLink></item>

	<item>
		<title>HeartSaver: Experimenting with News Games to Tell a Story</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/DmNKVxX_t_8/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/heartsaver-an-experiment-in-using-news-games-to-tell-a-story/#25705</guid>
		<description>&lt;style&gt;
.gist .gist-file .gist-data { font-size: 12px !important; }
.bodytext ol { margin: 0 20px 0 20px !important; }
&lt;/style&gt;
&lt;p&gt;This past weekend, a team from ProPublica competed in the GEN &lt;a href="http://www.globaleditorsnetwork.org/editors-lab/nyt/"&gt;Editors&amp;#8217; Lab New York hack day&lt;/a&gt;, with the theme &amp;#8220;Newsgaming.&amp;#8221; We learned how to use game mechanics to create an interactive experience that went beyond badges and reputation systems to explore a complex accountability story in a fun and engaging way. Seriously, our game is really fun. &lt;/p&gt;
&lt;p&gt;Here&amp;#8217;s what we learned (and produced!) in two days. &lt;/p&gt;

&lt;p&gt;The game, &lt;a href="http://projects.propublica.org/graphics/heartsaver"&gt;HeartSaver&lt;/a&gt;, aims to help players explore access to good emergency care in New York City, where nearly half a million people suffer from heart disease. We wondered: does everyone have access to high-quality care? What happens if I have a heart attack at work? At home? We invited players to explore this question by challenging them with a mission: Save as many heart attack victims as possible. &lt;/p&gt;
&lt;p&gt;A player&amp;#8217;s score depends on how quickly they can get heart attack victims to the best available hospital. For victims, survival often hinges on how quickly they can get to the emergency room, with odds decreasing 7-10 percent every minute before defibrillation, &lt;a href="http://www.heart.org/idc/groups/heart-public/@wcm/@ecc/documents/downloadable/ucm_438703.pdf"&gt;according to the American Heart Association&lt;/a&gt;. And on arrival, quality of care plays an important part. We turned these factors into variables in the game. Players can explore how access to good emergency care works. The faster a player routes victims to quality care, the more lives they save. The more lives they save, the higher their score. How many lives do you think you could save? &lt;a href="http://sisiwei.github.io/gen-hackday-propublica/"&gt;Play the game and find out&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;News Games Tell a Story&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;While playing the game, players are telling themselves the story we want them to tell. It becomes clear very quickly which areas have fewer hospitals with emergency departments. When an icon representing a victim lands in Woodhaven, Cypress Hills or Whitestone, Queens, the closest hospital suddenly seems very far away. Having players experience the anxiety of seeing a victim with no hospitals nearby gives them an intuitive and memorable understanding of how the lack of emergency care affects neighborhoods -- much more so than they&amp;#8217;d get from an article or an interactive map. &lt;/p&gt;
&lt;p&gt;As the difficulty of HeartSaver increases, players are faced with two important and emotional decisions.&lt;/p&gt;
&lt;p&gt;First, if a victim appears close to a hospital with a below average heart-care rating, does the player risk a longer transportation time to bring the victim to a better hospital? The precious seconds players take to decide affect the victim&amp;#8217;s survival rate.&lt;/p&gt;
&lt;p&gt;Second, as the number of victims increases, it becomes difficult to prioritize them. Should a player triage victims based on which ones needed help first, or focus on helping those already close to a hospital? &lt;/p&gt;
&lt;p&gt;Forcing players to make these decisions lets them actually experience&amp;#160;the complexity and difficult decisions in the real system.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Using Real-ish Data in a Game&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;HeartSaver relies on three key data points: The time elapsed since each victim&amp;#8217;s 911 call (how long it takes the player to drop the victim icon onto a hospital), the time it would take to transport the victim from their location to that hospital in the real world, and that hospital&amp;#8217;s relative quality of care. &lt;/p&gt;
&lt;p&gt;In the game, a minute is compressed into a second. For every second that passes as the victim awaits help, survival odds decrease 10 percent. &amp;#160;For every second of game time spent driving, survival odds decrease by .58 percent. Since we couldn&amp;#8217;t find solid data connecting minutes of ambulance travel time with survival odds, the game uses the &lt;a href="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2464671/"&gt;lowest rate we could find&lt;/a&gt;. &amp;#160;We calculate the survival rate once the victim arrives at the hospital using that hospital&amp;#8217;s mortality rate. If you&amp;#8217;re a researcher and can help us answer this question more rigorously, &lt;a href="mailto:sisi.wei@propublica.org"&gt;let us know&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;We calculate transport time from a victim&amp;#8217;s location to the hospital the player drops them onto by passing addresses and coordinates to the &lt;a href="https://developers.google.com/maps/documentation/directions/"&gt;Google Directions API&lt;/a&gt;, which returns the estimated driving time.&lt;/p&gt;
&lt;p&gt;We also use Medicare&amp;#8217;s risk-adjusted &lt;a href="https://data.medicare.gov/Hospital-Compare/Hospital-Outcome-Of-Care-Measures/f24z-mvb9"&gt;30 day heart attack mortality rates&lt;/a&gt;&amp;#160;to factor hospital performance. In the game, a victim&amp;#8217;s post-travel survival chances are multiplied by Medicare&amp;#8217;s survival rate for heart attack victims at that hospital. Finally, there&amp;#8217;s an element of randomness applied after all of these factors are combined.&lt;/p&gt;
&lt;p&gt;Here&amp;#8217;s the full formula for how we calculate survival chances:&lt;/p&gt;
&lt;p&gt;&lt;script src="https://gist.github.com/ashaw/5439381.js"&gt;&lt;/script&gt;&lt;/p&gt;
&lt;p&gt;Some important caveats: We don&amp;#8217;t know if a hospital's mortality rate factors out things like response time and travel distance. Also, in the real world, there isn't just one ambulance dispatcher for the city, and you'll almost definitely end up at the &lt;a href="http://www.health.ny.gov/professionals/ems/policy/06-01.htm"&gt;nearest hospital to your current location&lt;/a&gt;. Finally, the American Heart Association &lt;a href="http://www.heart.org/idc/groups/heart-public/@wcm/@ecc/documents/downloadable/ucm_438703.pdf"&gt;statistic&lt;/a&gt; we use refers to sudden cardiac arrests, which are different from heart attacks, though a heart attack can also lead to sudden cardiac arrest. Read more about that &lt;a href="http://www.aedbrands.com/resource-center/education/heart-attack-vs-sudden-cardiac-arrest/"&gt;here&lt;/a&gt;. This is a two-day hackathon project meant to explore how news games work, and not the results of months of rigorous research, so take things with a grain of salt (actually, you should probably &lt;a href="http://www.cdc.gov/bloodpressure/sodium.htm"&gt;go easy on the salt&lt;/a&gt;).&lt;/p&gt;
&lt;p&gt;In order to make sure that every &amp;#8220;patient&amp;#8221; landed in one of the New York City boroughs, and not, say, in the water or New Jersey, we used geographic shapefiles from the &lt;a href="http://www.nyc.gov/html/dcp/html/bytes/dwndistricts.shtml#bcd"&gt;NYC Department of City Planning&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;News Games vs. Interactive Graphics&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;We could have made a great interactive graphic on heart disease and emergency care in New York, plotting hospital locations and quality of care ratings relative to reader locations. But that&amp;#8217;s not a game.&lt;/p&gt;
&lt;p&gt;On the first day of the hackathon, Columbia Journalism School professor &lt;a href="http://susanemcgregor.com/"&gt;Susan McGregor&lt;/a&gt;&amp;#160;spoke to the teams &lt;a href="https://docs.google.com/presentation/d/11ZettJoSkuL5RttTo_FiiKI5dXEPyjiVg3E2OBu04DY/edit#slide=id.p"&gt;about game design&lt;/a&gt;, and three of the criteria really stood out to us.&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Games have an objective: Some games like &lt;a href="https://minecraft.net/"&gt;Minecraft&lt;/a&gt;&amp;#160;can pull off being a &lt;a href="http://en.wikipedia.org/wiki/Sandbox_game"&gt;sandbox&lt;/a&gt;, which is basically an open-ended game where players can do anything they want, without any goals. But for most games, the objective is what makes it fun, because people enjoy the feeling that comes with overcoming challenges.&lt;/li&gt;
&lt;li&gt;Result of the game is unknown: Players don&amp;#8217;t know ahead of time how the game will end, and more importantly, players can take actions that impact the outcome. The game isn&amp;#8217;t passive, instead it actively engages the player in a series of decisions and actions toward a final goal.&lt;/li&gt;
&lt;li&gt;Creating the feeling of &amp;#8220;together, apart&amp;#8221;: A great game allows players to feel a sense of community, even though they are sitting with their own devices, looking at their own screen. Think of the success of massive multiplayer online games (or MMOs), such as the &lt;a href="http://us.battle.net/wow/en/"&gt;World of Warcraft&lt;/a&gt;. Our goals for HeartSaver were much more scaled back, but we wanted to create a &amp;#8220;together-apart&amp;#8221; feeling by giving players statistics on how their score compared with others who played.&lt;/li&gt;
&lt;/ol&gt;
&lt;p&gt;If you&amp;#8217;re interested in learning more about news games, Sisi also gave a &lt;a href="http://vimeo.com/61297159"&gt;lightning talk&lt;/a&gt; on how to make them at the NICAR 2013 conference.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Next Steps&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Because this was a two-day hackathon project, we were pretty constrained as to what we were able to accomplish. If we had more time, we would have been much more rigorous with the data, and added more opportunities for players to see access to high-quality care near them. &lt;/p&gt;
&lt;p&gt;Also, we would&amp;#8217;ve allowed players to input their own addresses for work and home (or link up their Foursquare account to detect popular locations) to see how far&amp;#160;their nearest hospital was. Then, we would&amp;#8217;ve encouraged them to share and check hospital distances for family and friends. Once we&amp;#8217;ve got readers&amp;#8217; attention with the game, we would have loved to show them an immersive interactive graphic&amp;#160;to show how quality of care in the city affects them personally.&lt;/p&gt;
&lt;p&gt;We want to thank the Global Editors&amp;#8217; Network for such a great event. As you can tell, we&amp;#8217;re pretty excited about what news games can do. If you haven&amp;#8217;t already, check out &lt;a href="http://projects.propublica.org/graphics/heartsaver"&gt;HeartSaver&lt;/a&gt;, which is currently still a prototype, and make sure to play &lt;a href="http://www.globaleditorsnetwork.org/editors-lab/nyt/"&gt;all the other games&lt;/a&gt;&amp;#160;that came out of the hackathon!&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/DmNKVxX_t_8" height="1" width="1"/&gt;</description>
		<dc:author>Al Shaw</dc:author>
		<dc:subject />
		<dc:date>2013-04-23T10:51:25-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/heartsaver-an-experiment-in-using-news-games-to-tell-a-story/</feedburner:origLink></item>

	<item>
		<title>Meet the New Blog (Same as the Old Blog)</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/9aXtD_bRLws/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/meet-the-new-blog-same-as-the-old-blog/#25665</guid>
		<description>&lt;p&gt;Today we debut a ProPublica Nerd Blog redesign, which includes a new look for our main landing page as well as our article pages. We wanted to create both a better reading experience, as well as to help you find what you&amp;#8217;re looking for more easily. Here are three new features we want to highlight:&lt;/p&gt;

&lt;p&gt;First, we created &lt;strong&gt;a new sidebar&lt;/strong&gt; to help you quickly locate our tools and guides, including ProPublica's latest &lt;a href="http://www.propublica.org/nerds/item/propublicas-news-app-guides"&gt;News Apps and Data Guides&lt;/a&gt;. It&amp;#8217;s powered by the &lt;a href="http://developer.github.com/v3/"&gt;Github API&lt;/a&gt;, so anything we open source will instantly be added there. We've also added a section called "&lt;strong&gt;Explore Our Work&lt;/strong&gt;" that gives you another way to navigate through our stuff.&lt;/p&gt;

&lt;p&gt;Second, we're &lt;strong&gt;cross-posting announcements of all of our news apps and graphics&lt;/strong&gt; here. This makes the Nerd Blog a one-stop shop for all of our projects and the technical posts we write about them. Look for the gold boxes for new apps.&lt;/p&gt;

&lt;p&gt;Third, we added simple &lt;strong&gt;keyboard shortcuts&lt;/strong&gt; to the landing page to help you navigate more quickly. You can press "j" to skip to the next blog post, and "k" to go back. &lt;a href="http://www.vim.org/"&gt;Vim&lt;/a&gt; fans should be delighted.&lt;/p&gt;

&lt;p&gt;We hope you like the new look, as well as our new "data bass" icon, drawn by &lt;a href="http://jasondas.com/"&gt;Jason Das&lt;/a&gt;, and we welcome suggestions on how to make the nerd blog even better in the comment section -- now powered by Disqus -- below.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/9aXtD_bRLws" height="1" width="1"/&gt;</description>
		<dc:author>Sisi Wei</dc:author>
		<dc:subject />
		<dc:date>2013-04-16T10:07:40-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/meet-the-new-blog-same-as-the-old-blog/</feedburner:origLink></item>

	<item>
		<title>Heart of Nerd Darkness: Why Updating Dollars for Docs Was So Difficult</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/p2zN0DKeGfQ/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/heart-of-nerd-darkness-why-dollars-for-docs-was-so-difficult/#25619</guid>
		<description>&lt;p&gt;Last week we published a big update to &lt;a href="http://projects.propublica.org/docdollars"&gt;Dollars for Docs&lt;/a&gt;, our interactive news application of payments made to U.S. health-care providers by 15 pharmaceutical companies. Compared to when we launched the project in 2010, the amount of data we&amp;rsquo;re collecting has grown enormously: The list of payments increased from around 750,000 to almost 2 million, and the grand total of the payments grew from around $750 million to just under $2 billion.&lt;/p&gt;
&lt;p&gt;Compiling the data for it has been an enormous project right from the beginning. After we published the first version, the original developer on the project, Dan Nguyen, compiled all of the things he had to learn into a &lt;a href="http://www.propublica.org/nerds/item/doc-dollars-guides-collecting-the-data"&gt;guide to scraping data&lt;/a&gt;. This year&amp;rsquo;s update took more than eight months of full-time work by me, working with other news-app developers, and at times with our CAR team, a researcher, two editors and two health-care reporters. It was a massive effort and presented huge technical and journalistic challenges.&lt;/p&gt;
&lt;p&gt;After we launched, my editor pulled me aside and asked what was so hard about Dollars for Docs. What follows is my answer.&lt;/p&gt;
&lt;h2&gt;
	PDFs Considered Harmful&lt;/h2&gt;
&lt;p&gt;To build the database that powers Dollars for Docs, we had to convert 65 gigantic disclosure reports from some other format into raw structured data. Each company releases its data in a slightly different way, but broadly speaking there are two ways they present the data: A few build websites to publish the data, and many more disclose their payments in humongous Adobe Acrobat PDF files.&lt;/p&gt;
&lt;p&gt;The trouble is, PDF was not designed as a data format. It was designed as an &amp;quot;electronic paper&amp;quot; format. That is, &amp;quot;something whose contents (text and 2D images) would look the same on any computer at any time,&amp;quot; Adobe Senior Product Marketing Manager Ali Hanyaloglu told me.&lt;/p&gt;
&lt;p&gt;PDFs are a format engineered to present elements in perfect fidelity to their creator&amp;#39;s intentions. In their most basic form, PDFs don&amp;rsquo;t know what tabular data is. They don&amp;rsquo;t even know what words are.&lt;/p&gt;
&lt;p&gt;Here&amp;rsquo;s how a PDF works, deep down: It positions text by placing each character at minutely precise coordinates in relation to the bottom-left corner of the page. It does something similar for other elements like images. A PDF knows about shapes, characters and their precise positions on the page. Even if a PDF looks like a spreadsheet -- in fact, even when it&amp;rsquo;s made using Microsoft Excel -- the PDF format doesn&amp;rsquo;t retain any sense of the &amp;ldquo;cells&amp;rdquo; that once contained the data.&lt;/p&gt;
&lt;p&gt;Adobe, which invented the PDF in 1993, acknowledges its shortcomings when it comes to data. &amp;ldquo;For the person who wants raw data, PDF isn&amp;rsquo;t the right choice,&amp;rdquo; wrote Adobe Senior Principal Scientist Jim King in a &lt;a href="http://blogs.adobe.com/insidepdf/2011/10/my-pdf-hammer-revision.html"&gt;blogpost&lt;/a&gt;. In 2000, Adobe made an updated version of the PDF format with an easy way to &lt;a href="http://blogs.adobe.com/insidepdf/2010/11/pdf-file-attachments.html"&gt;attach files&lt;/a&gt; so that original data could be published alongside an attractive presentation.&lt;/p&gt;
&lt;p&gt;Incidentally, Adobe made doing this really easy. We made &lt;a href="http://youtu.be/CKDWr1h8Y9c"&gt;a screencast showing how&lt;/a&gt;. According to Hanyaloglu, it can be done using Adobe Acrobat, Microsoft Office or even with the free Adobe Reader. I&amp;rsquo;m not quite sure why I don&amp;rsquo;t see PDFs with attached data in the wild more often. In any event, none of the 50 PDFs we used to compile the Dollars for Docs database attached the raw data.&lt;/p&gt;
&lt;p&gt;&lt;iframe allowfullscreen="" frameborder="0" height="473" src="http://www.youtube.com/embed/CKDWr1h8Y9c" style="margin-bottom: 12px;" width="620"&gt;&lt;/iframe&gt;&lt;/p&gt;
&lt;p&gt;For each PDF, we had to reconstruct the raw data, and it&amp;rsquo;s not an easy thing to do. We don&amp;rsquo;t know of any off-the-shelf software that did precisely what we needed to do, so we wrote our own. We named it &amp;quot;&lt;a href="http://www.merriam-webster.com/dictionary/farrago"&gt;Farrago&lt;/a&gt;.&amp;quot; The first version was built by &lt;a href="http://frackman.org/blog/"&gt;David Frackman&lt;/a&gt;, a New York-based developer who recognized this as a problem that could be addressed with a computer science discipline called &lt;a href="http://en.wikipedia.org/wiki/Computer_vision"&gt;computer vision&lt;/a&gt;, which is used for things like facial recognition and automatic license-plate readers.&lt;/p&gt;
&lt;p&gt;Farrago reconstructs the data from a PDF by considering each character one by one to determine which column it belongs in. It uses a computer vision technique called a &lt;a href="http://en.wikipedia.org/wiki/Hough_transform"&gt;Hough Transform&lt;/a&gt; to detect the vertical black lines in the background of the PDF that visually separate columns in the table.&lt;/p&gt;
&lt;p&gt;&lt;img alt="A picture of Hough Transform lines on an image of a PDF page" src="http://propublica.s3.amazonaws.com/assets/nerds/d4d-hard/d4d-pic-1.png" title="" / style="width: 620px;"&gt;&lt;/p&gt;
&lt;p&gt;But not all PDFs have borders around each table cell. For these PDFs, we had to visually guess the locations of these dividing lines and adjust the code again and again until it came out right. Unfortunately, there&amp;#39;s no easy way to check if tens of thousands of lines were all copied correctly. Many times we had to reprocess a file because one person&amp;#39;s long name was inadvertently split, with most of their name in the name column and the last few letters in the next column. For example, an incorrect column dividing line location for a payment to a doctor with the last name &amp;quot;Ingemar&amp;quot; in Salt Lake City might end up listing &amp;quot;INGE&amp;quot; as a last name and the city as &amp;quot;MARSALT LAKE CITY.&amp;quot;&lt;/p&gt;
&lt;p&gt;&lt;img alt="An incorrectly located column boundary" src="http://propublica.s3.amazonaws.com/assets/nerds/d4d-hard/d4d-pic-2.png" title="" /&gt;&lt;/p&gt;
&lt;p&gt;Even when Farrago has an accurate model for which columns everything belongs in, things aren&amp;#39;t easy. As I mentioned above, each letter is positioned on the page independently from other letters in the documents. That&amp;#39;s more inconvenient than it may sound: While letters like &amp;quot;a&amp;quot; or &amp;quot;c&amp;quot; have about the same location on the y-axis if they are on the same line, letters like &amp;quot;p&amp;quot; or &amp;quot;g&amp;quot; with &lt;a href="http://en.wikipedia.org/wiki/Descenders"&gt;descenders&lt;/a&gt; are a tiny bit closer to the bottom of the page than letters without descenders, like h and m. Punctuation marks like commas and periods also behave unexpectedly, for the same reason. Farrago &amp;quot;rolled up&amp;quot; the letters within an arbitrary number of pixels of the &lt;a href="http://en.wikipedia.org/wiki/Baseline_(typography)"&gt;baseline&lt;/a&gt; into a single line of text.&lt;/p&gt;
&lt;p&gt;&lt;img alt="Descenders picture" src="http://propublica.s3.amazonaws.com/assets/nerds/d4d-hard/Typography_Line_Terms.png" title="" /&gt;&lt;/p&gt;
&lt;p&gt;Here&amp;rsquo;s another mystery we found. Some PDFs we processed seem to contain duplicate lines of text that weren&amp;rsquo;t there if you just looked at the PDF. We had no idea why. The &lt;a href="http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/pdf_reference_1-7.pdf"&gt;PDF specification&lt;/a&gt;, which is the official recipe for how to write a program to display, print and create PDFs, made no mention of this phenomenon.&lt;/p&gt;
&lt;p&gt;&lt;a href="http://www.dbai.tuwien.ac.at/staff/hassan/files/p47-hassan.pdf"&gt;A paper by Tamir Hassan&lt;/a&gt;, a computer scientist whose research focuses on table recognition and information extraction, may have solved the mystery. &amp;ldquo;Characters (or complete strings),&amp;rdquo; he wrote, &amp;ldquo;are sometimes overprinted with a slight offset to simulate boldface type. As long as these instructions follow another, they are automatically detected and represented by a single text fragment with the boldface flag set to true.&amp;rdquo;&lt;/p&gt;
&lt;p&gt;Finally, because PDFs don&amp;rsquo;t know about words, they often don&amp;rsquo;t include a space character separating them. Instead, PDFs simply locate each group of letters a few pixels to the right of the one that precedes it. A reader can&amp;rsquo;t tell the difference, but to a computer the difference is a big pain. We used some PDF-reading software libraries that made sometimes imperfect guesses about where to put spaces in.&lt;/p&gt;
&lt;p&gt;Even after we had Farrago doing the heavy lifting, it often took hours to precisely tune how it processed a given PDF, and it sometimes took days to discover subtle problems and write a fix or workaround.&lt;/p&gt;
&lt;h2&gt;
	Breaking the Web&lt;/h2&gt;
&lt;p&gt;If they follow best practices, websites are easy to turn into structured data. But pharmaceutical companies&amp;#39; websites don&amp;#39;t always follow best practices. Some clumsily reinvent the basic mechanics of the Internet and make &amp;ldquo;scraping&amp;rdquo; the data out of them exceedingly difficult.&lt;/p&gt;
&lt;p&gt;One company&amp;#39;s disclosure site spanned 4,000 web pages that all shared a single URL. That is, there was no way to get to page 3,000 without viewing page one, then page two, then all the intervening pages through page 2,999. This wouldn&amp;#39;t be so bad, except the company&amp;#39;s web servers often got overwhelmed and returned a broken page, forcing us to start over.&lt;/p&gt;
&lt;p&gt;After I re-engineered the code to work around the site&amp;#39;s limitations with my colleague Jeff Larson&amp;#39;s help, the server would still occasionally get confused, and in unpredictable ways. If we requested, for instance, pages three and four, the server might return pages labeled &amp;quot;3&amp;quot; and &amp;quot;4&amp;quot; but both containing the data intended to be on page four.&lt;/p&gt;
&lt;p&gt;Another company&amp;#39;s disclosures seemed, at first, to all be on a single 29.6 megabyte web page. It took more than two and a half minutes to download and render in Google Chrome -- on a very fast Internet connection. But it turns out this behemoth of a web page did not even include all of the disclosure data. It included only the total amount paid to each payee. The details about each payment were on a fragment of HTML that appeared when a user clicked a link. We had to download all 81,335 of these separately and merge them with the list of payees so that we could report which portion of each payment was for meals, consulting, speaking, travel, etc.&lt;/p&gt;
&lt;h2&gt;
	It Gets Worse&lt;/h2&gt;
&lt;p&gt;Even when all the data was downloaded, we faced a row of hurdles.&lt;/p&gt;
&lt;p&gt;There is no standard way for companies to define each kind of payment -- at least, not yet. The 15 companies describe categories of payments in different ways and some even define categories differently between their own reports. For instance, while many companies disclose the value of meals given to doctors, Merck combines meals, travel expenses and royalties, so there is no way to break out the value of meals paid for by Merck to compare it to similar expenses disclosed by other companies. In all, we tracked 155 distinct category definitions across all of the companies in Dollars for Docs. The specific definition created by each company for its payment categories is available as a &amp;ldquo;tool-tip&amp;rdquo; next to each payment on Dollars for Docs payment pages.&lt;/p&gt;
&lt;p&gt;Another hurdle was dealing with the small number of companies that disclosed payments as ranges rather than precise dollar amounts. For instance, Allergan &lt;a href="http://projects.propublica.org/docdollars/payments/9851361"&gt;reported paying between $1 and $1,000&lt;/a&gt; to a doctor named Aaron Jacob Friedman in Lafayette, La., for &amp;quot;business meals&amp;quot; in 2011. There is, of course, no way of knowing if that was lunch at McDonald&amp;rsquo;s for $3 or dinner at Le Cirque for $999. Multiplied by the 95,582 payments in Allergan&amp;rsquo;s disclosures, this was a big problem. In the end we reported the ranges as we found them and excluded all the ranged payments from our aggregate numbers, like state and company totals.&lt;/p&gt;
&lt;p&gt;Because any doctor&amp;#39;s name is likely to be reported differently by each drug company, we had to be careful about how we reported what was paid to each doctor. Some companies list middle initials and others don&amp;rsquo;t. Some checks appear to have been written out personally to a doctor, others to the doctor&amp;#39;s practice. Some doctors&amp;#39; addresses are listed in multiple cities. Manually researching millions of records to determine which slightly different names are really the same person and which doctors share a name but are different people would be an impossible task. We chose not even to attempt it and instead followed the example set by the Center for Responsive Politics&amp;#39; &lt;a href="http://www.opensecrets.org/"&gt;OpenSecrets&lt;/a&gt; site. We show names as we find them and depend on users to know which records in a search result page refer to their doctor.&lt;/p&gt;
&lt;p&gt;For doctors who stood out -- like the 22 doctors who earned more than &lt;a href="http://www.propublica.org/article/dollars-for-docs-the-top-earners"&gt;half a million dollars&lt;/a&gt; -- our reporters confirmed that the payments were all to the same person.&lt;/p&gt;
&lt;p&gt;All of this made comparing the data and drawing conclusions very complicated. For instance, because we couldn&amp;rsquo;t with high confidence group all of a doctor&amp;rsquo;s payments together, we couldn&amp;rsquo;t report the highest paid doctor in each state. Luckily, some of the &lt;a href="http://projects.propublica.org/docdollars/#local-stories"&gt;news organizations who&amp;rsquo;ve done stories based on Dollars for Docs data&lt;/a&gt; did the necessary legwork and do name the most highly paid docs. We also could not compare some companies&amp;#39; payments to those made in a previous year. For instance, Merck&amp;#39;s 2012 payments can&amp;rsquo;t be compared to its 2011 payments because they changed what types of payments they disclosed. Nor could we compare companies&amp;#39; payments against each other, as each company discloses different types of payments.&lt;/p&gt;
&lt;h2&gt;
	Making Sure We Get It Right&lt;/h2&gt;
&lt;p&gt;After the data was collected, we worked very hard to make sure that every data point in our database was accurately reconstructed from its source. The techniques we used are detailed in my colleague Jennifer LaFleur&amp;#39;s &lt;a href="https://github.com/propublica/guides/blob/master/data-bulletproofing.md"&gt;data bulletproofing guide&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;During more than a dozen rounds of spot checking, my fellow ProPublicans used statistical sampling to make random selections of hundreds of records at a time and manually compared them to the record in the original PDF or website. We also manually recomputed every sum and average -- for aggregates like companies and states -- using Microsoft Access and compared them to the version calculated by our Ruby code.&lt;/p&gt;
&lt;p&gt;We also used a SQL &amp;ldquo;GROUP BY&amp;rdquo; query to get all of the unique state abbreviations in the data set. This helped us locate errors like &amp;quot;M I&amp;quot; (with a space) instead of &amp;quot;MI&amp;quot; (without the space) for Michigan. We then sorted every column to make sure that it didn&amp;#39;t include numbers in alphabetical columns, like city names, or letters in the amount column.&lt;/p&gt;
&lt;p&gt;Another important component of our data-bulletproofing system was keeping our code and data well-organized. Each step of the data-cleaning process had its own script. For instance, one script used Farrago to scrape a PDF and filtered out header rows. For each step, we saved each of these scripts&amp;#39; output. This made it easy to track down errors. If, during one of our spot checks, we found a line that was missing in our final data that was present in the source data, we could look at each step of the data-cleaning process for that document to pinpoint which script had introduced the problem. After we had fixed the problem, the whole process could be re-run with just one command.&lt;/p&gt;
&lt;h2&gt;
	Help is On the Way&lt;/h2&gt;
&lt;p&gt;The Physician Payment Sunshine Act may resolve many of the problems we had. A provision of the Affordable Care Act, the PPSA will require pharmaceutical companies and medical device manufacturers to disclose their payments to doctors directly to the government, which will in turn put them on a public website that makes the data &amp;quot;downloadable and easily aggregated&amp;quot; -- that is, as structured data. (Speaking of disclosures, here&amp;rsquo;s one from us: ProPublica has been consulting with Deloitte, who is bidding on a contract to build that website.)&lt;/p&gt;
&lt;p&gt;Drug companies and manufacturers will also be required to disclose, on an established timeline with specific definitions, precise payment amounts and to disclose a doctor&amp;#39;s middle initial, if they know it. Companies&amp;#39; disclosures will also have to include each doctor&amp;#39;s National Provider Identifier, which is a unique numeric identifier for medical professionals, though that number won&amp;rsquo;t be made public. So when PPSA is fully implemented in mid-2014, the payments will be comparable between companies and across time periods.&lt;/p&gt;
&lt;p&gt;Until then, we&amp;rsquo;ll still be here doing things the hard way. Despite all of the problems, we believe strongly that Dollars for Docs is an important tool to ensure that Americans can have full and frank discussions with their doctors about their financial relationships.&lt;/p&gt;
&lt;h2&gt;
	One More Thing&lt;/h2&gt;
&lt;p&gt;Manuel Aristar&amp;aacute;n, a Knight-Mozilla OpenNews Fellow at La Naci&amp;oacute;n, a newspaper in Buenos Aires, Argentina, has been developing a tool called &lt;a href="http://tabula.nerdpower.org/"&gt;Tabula&lt;/a&gt; for extracting tabular data from PDFs. Tabula&amp;#39;s interface and algorithm for assembling characters into lines of text are much more developed than Farrago&amp;rsquo;s, so we&amp;rsquo;re helping fold Farrago into Tabula. Look for an open-source release of Tabula very soon.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Update&lt;/strong&gt;: &lt;a href="http://source.mozillaopennews.org/en-US/articles/introducing-tabula/"&gt;Tabula has been released&lt;/a&gt;.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/p2zN0DKeGfQ" height="1" width="1"/&gt;</description>
		<dc:author>Jeremy B. Merrill</dc:author>
		<dc:subject />
		<dc:date>2013-03-25T11:24:46-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/heart-of-nerd-darkness-why-dollars-for-docs-was-so-difficult/</feedburner:origLink></item>

	<item>
		<title>No Windows. One Exit. Free Drinks: Casino-Driven Design for Crowdsourcing</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/vYEdbw5RCU4/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/casino-driven-design/#25614</guid>
		<description>&lt;style&gt;
.article .article-inline-image img { width:630px; max-width:630px; border:1px solid #cecece; }
&lt;/style&gt;
&lt;p&gt;During the 2012 election, we created &lt;a href="https://projects.propublica.org/free-the-files"&gt;Free the Files&lt;/a&gt;, an interactive news application based on crowdsourced data, built in real time by thousands of volunteers. It was a collaborative effort to track TV ad spending by campaigns, super PACs and so-called &amp;#8220;dark money&amp;#8221; nonprofit groups in the country&amp;#8217;s top swing markets.&lt;/p&gt;

&lt;p&gt;Measured by participation rate, Free the Files was an &lt;a href="http://www.propublica.org/article/crowdsourcing-campaign-spending-what-we-learned-from-free-the-files"&gt;astonishing success&lt;/a&gt;. More than 1,000 contributors submitted over 94,000 transcriptions to help turn messy invoices from local TV networks into clean data. One volunteer transcribed over 28,000 filings. Each transcription was &amp;#8220;verified&amp;#8221; after two or more users agreed on all of its data points. There are currently around 17,000 verified filings, and people are still working.&lt;/p&gt;

&lt;p&gt;Much of this success came from the efforts of our expert &lt;a href="http://www.propublica.org/getinvolved"&gt;engagement team&lt;/a&gt;, who motivated and interacted with our volunteers every day, and who hatched a clever campaign of social media, contests and promises of free T-shirts. Our users were, of course, also motivated by their own sense of civic responsibility, wanting to help build the first free database of political TV ad spending.&lt;/p&gt;

&lt;p&gt;But design played a significant role as well. We kept our eye on optimizing each page for participation. We called the design we devised for participation-oriented areas of the site &amp;#8220;Casino-Driven Design.&amp;#8221; A variant of &lt;a href="http://schedule.sxsw.com/2011/events/event_IAP7330"&gt;Behavior Design&lt;/a&gt;, Casino-Driven Design cuts away all distraction and drives the user&amp;#8217;s attention toward staying focused on a single task.&lt;/p&gt;

&lt;h2 id="no_windows_one_exit_free_drinks"&gt;No Windows. One Exit. Free Drinks.&lt;/h2&gt;

&lt;p&gt;Why do we call it casino-driven design? Casinos are notorious for adopting an interior design that keeps people gambling. There are no windows and no clocks so it&amp;#8217;s easy to lose track of how long one has been gambling.&lt;/p&gt;

&lt;p&gt;Casino-driven design creates an optimal atmosphere for task completion by actively discouraging cross-site exploration and page exits. And, like in a real casino, we keep the small rewards flowing, such as seeing your name on &amp;#8220;freed file&amp;#8221; &lt;a href="https://projects.propublica.org/free-the-files/filings/28099"&gt;pages&lt;/a&gt; and on a &lt;a href="https://projects.propublica.org/free-the-files/leaderboard"&gt;leaderboard&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;There are no site-wide template elements on casino-driven pages &amp;#8212; no section links, ads or even a link to the homepage. Casino-driven pages start out as a blank white page. We add only the elements necessary for users to understand where they are and the nature of the next task. The only way off the page is to complete the task or abandon it.&lt;/p&gt;

&lt;p&gt;But there&amp;#8217;s more to casino-driven design than just a clean page.&lt;/p&gt;

&lt;h2 id="glass_doors"&gt;Glass Doors&lt;/h2&gt;

&lt;p&gt;There are two important goals in getting people to complete crowdsourcing tasks: Getting them to start volunteering, and getting them to keep going.&lt;/p&gt;

&lt;p&gt;We tried out a bunch of strategies for getting people into the app for the first time, but our favorite was the &amp;#8220;glass door&amp;#8221;: We show a tantalizing taste of the activities that will become available after a user takes the first step &amp;#8212; signing up:&lt;/p&gt;

&lt;p&gt;&lt;img src="http://propublica.s3.amazonaws.com/assets/nerds/casino-glassdoor.png" alt="What's behind the frosted glass door?" title="" /&gt;&lt;/p&gt;

&lt;p&gt;Behind the &amp;#8220;frosted glass&amp;#8221; is a view of what you&amp;#8217;ll be able to do once you&amp;#8217;ve logged in. It&amp;#8217;s hard to get to that screen and &lt;em&gt;not&lt;/em&gt; want to see what&amp;#8217;s behind it.&lt;/p&gt;

&lt;p&gt;We also helped our readers find an activity using yellow and blue wayfinding at the top of each page:&lt;/p&gt;

&lt;p&gt;&lt;img src="http://propublica.s3.amazonaws.com/assets/nerds/casino-wayfinding-bars.png" alt="Wayfinding bars with real-time status data encouraged participation." title="" /&gt;&lt;/p&gt;

&lt;p&gt;Yellow status bars across all of our apps mean &amp;#8220;this stuff is about you.&amp;#8221; In Free the Files, we used it to give people quick links to logging in, an at-a-glance look at how many files they&amp;#8217;ve freed, and what&amp;#8217;s coming up. Light blue boxes and bars across our apps mean &amp;#8220;here&amp;#8217;s the most important thing you can do&amp;#8221; (we call this shade of blue &amp;#8220;do-something blue&amp;#8221;). When we launched an election-day contest to transcribe all the files in Las Vegas, we put a blue bar on top of the pages to let them know there was an important activity to take part in. By the end of the single-day Las Vegas challenge, we had transcribed every file in the market.&lt;/p&gt;

&lt;h2 id="low_friction"&gt;Low Friction&lt;/h2&gt;

&lt;p&gt;Once inside, we keep the click targets really big and bright, and the number of actions as low as we can get away with. We spent a long time editing down the number of data elements we asked our readers to help us transcribe. In the end we pared down to four elements &amp;#8212; and after launch we found ways to cut that down even lower. &lt;/p&gt;

&lt;p&gt;Once the user has transcribed them, he or she can mash the big green or red button. Quite satisfying!&lt;/p&gt;

&lt;p&gt;&lt;img src="http://propublica.s3.amazonaws.com/assets/nerds/casino-doc-do.png" alt="A document &amp;quot;do&amp;quot; page." title="" /&gt;&lt;/p&gt;

&lt;p&gt;To keep people around longer, we engineered the page never to fully reload. There was never a reason for people to leave the page &amp;#8212; we provided pop-up instructions and autocomplete boxes for values we thought users may type in. For example, we preloaded the &amp;#8220;Who bought it&amp;#8221;  autocomplete field with committee names from both the FEC&amp;#8217;s database and the FCC&amp;#8217;s file naming scheme so users could start typing and choose one rather than try to decode the often confusing forms.&lt;/p&gt;

&lt;p&gt;Finally, we borrowed a trick from the gaming world by adding a &lt;a href="https://projects.propublica.org/free-the-files/leaderboard"&gt;leaderboard&lt;/a&gt; so that users could (justifiably) brag about how much they were contributing. They became super-competitive about their spot on the board. During the election-day challenge, we showed users their single-day count right in the yellow bar on each page to help them keep track of their score.&lt;/p&gt;

&lt;p&gt;&lt;img src="http://propublica.s3.amazonaws.com/assets/nerds/casino-leaderboard.png" alt="The leaderboard." title="" /&gt;&lt;/p&gt;

&lt;p&gt;Users were also able to brag in social media about freeing files &amp;#8212; in a popup, without leaving the casino.&lt;/p&gt;

&lt;p&gt;Casino-driven design was resoundingly successful in our election-related apps. We later used it to help drive participation in our &lt;a href="http://projects.propublica.org/emails/"&gt;Message Machine&lt;/a&gt; project. We&amp;#8217;re excited to keep evolving it when we build crowdsourcing apps.&lt;/p&gt;

&lt;h2 id="looking_ahead"&gt;Looking Ahead&lt;/h2&gt;

&lt;p&gt;Casino-driven Design is all about reducing friction to participation. We showed Free the Files to some MIT computer-science profs and they gave us some interesting ideas we&amp;#8217;re eager to try in future casinos:&lt;/p&gt;

&lt;p&gt;One of the things that made transcribing the ad contracts difficult was that there were a plethora of proprietary page layouts that each station or network of stations used. This meant that users had to hunt around for the same data points in different places on different filings. It&amp;#8217;s possible to cluster similar-looking documents using a technology called &lt;a href="http://en.wikipedia.org/wiki/Computer_vision"&gt;computer vision&lt;/a&gt; and then only show one kind of page to each user. This ought to speed their work up. &lt;/p&gt;

&lt;p&gt;We also plan to experiment with presenting users with a single task &amp;#8212; say, transcribing just the date on a single form design &amp;#8212; and repeating the task over and over again. This may let the unconscious mind take over and speed task completion enormously. Different users would see different elements to transcribe, so we&amp;#8217;d still end up with the same data, just split up more atomically.&lt;/p&gt;

&lt;p&gt;Beyond that, we could also ask users simply to draw boxes around the spot where each data point can be found in these various formats and then write software to look in those places and use OCR to transcribe what it finds. OCR isn&amp;#8217;t a good fit for analyzing entire documents like in Free the Files (especially scanned or faxed pages), but if we have human help to guide the OCR on these boxes, we may eventually be able to grab data out of forms with only human verification.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/vYEdbw5RCU4" height="1" width="1"/&gt;</description>
		<dc:author>Al Shaw</dc:author>
		<dc:subject />
		<dc:date>2013-03-20T12:02:02-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/casino-driven-design/</feedburner:origLink></item>

	<item>
		<title>ProPublica Honored with Best Map, Two Medals at Malofiej 21</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/xQxfXWCo-fI/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/propublica-honored-with-best-map-two-medals-at-malofiej-21/#25606</guid>
		<description>&lt;p&gt;On Friday, the Spanish chapter of the Society of News Design (SNDE) announced the winners of the 21st &lt;a href="http://www.malofiej21.com/"&gt;Malofiej&lt;/a&gt; International Infographics Awards. The jury evaluated over 1,000 entries from 28 countries.&lt;/p&gt;

&lt;p&gt;This year, ProPublica was honored with the Miguel Urabayen Award for best map in the online category for &lt;a href="http://propublica.github.com/stateface/"&gt;StateFace&lt;/a&gt;, an open-source font we created. It&amp;#8217;s made up of U.S. state shapes and is meant to be used as a design element in interactive web apps and graphics. The judges were impressed by StateFace&amp;#8217;s versatility and its availability as a public service. Besides our &lt;a href="http://projects.propublica.org/pipelines/"&gt;own work&lt;/a&gt;, StateFace has been used by &lt;a href="http://elections.npr.org/"&gt;NPR&lt;/a&gt;, &lt;a href="http://www.isbarackobamathepresident.com/"&gt;the Guardian&lt;/a&gt;, the &lt;a href="http://elections.huffingtonpost.com/2012/romney-vs-obama-electoral-map"&gt;Huffington Post&lt;/a&gt; and many others, especially in graphics and applications about the 2012 election. ProPublica also received a silver medal in the Innovative Format category for StateFace, and a bronze medal in Online Features/World and Nation for &lt;a href="http://projects.propublica.org/graphics/city-maps"&gt;Housing Segregation: The Great Migration and Beyond&lt;/a&gt; by Jeff Larson and Nikole Hannah-Jones.&lt;/p&gt;

&lt;p&gt;The Malofiej awards have been called the &amp;#8220;&lt;a href="http://www.malofiej21.com/about/"&gt;Pulitzers for Infographics&lt;/a&gt;.&amp;#8221; We&amp;#8217;re extremely pleased to have been honored with three awards in the 2013 competition. Congrats to all of the &lt;a href="http://www.malofiej21.com/wp-content/uploads/2013/03/M21-Premios-Lista-OK2.pdf"&gt;winners&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;&lt;em&gt;Read our &lt;a href="http://www.propublica.org/nerds/item/outsider-car-quick-thoughts-on-malofiej-21-day-1/"&gt;Day 1&lt;/a&gt; and &lt;a href="http://www.propublica.org/nerds/item/between-human-and-machine-thoughts-on-malofiej-21-day-2/"&gt;Day 2&lt;/a&gt; impressions of the Malofiej World Infographic Summit in Pamplona, Spain.&lt;/em&gt;&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/xQxfXWCo-fI" height="1" width="1"/&gt;</description>
		<dc:author>ProPublica</dc:author>
		<dc:subject />
		<dc:date>2013-03-18T11:03:05-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/propublica-honored-with-best-map-two-medals-at-malofiej-21/</feedburner:origLink></item>

	<item>
		<title>Between Human and Machine: Thoughts on Malofiej 21 Day 2</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/zL2iuUzE1Yg/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/between-human-and-machine-thoughts-on-malofiej-21-day-2/#25605</guid>
		<description>&lt;p&gt;&lt;em&gt;This is the second of two posts about the 21st Malofiej World Infographic Summit. &lt;a href="http://www.propublica.org/nerds/item/outsider-car-quick-thoughts-on-malofiej-21-day-1/"&gt;Read the first here&lt;/a&gt;.&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;In one of the final sessions of the &lt;a href="http://www.malofiej21.com/malofiej-world-summit-program/"&gt;21st Malofiej World Infographic Summit&lt;/a&gt; on Friday in Pamplona, Spain, Nigel Holmes noted the common theme of &amp;#8220;hands&amp;#8221; in the week&amp;#8217;s presentations.&lt;/p&gt;

&lt;p&gt;&amp;#8220;We are humans, we are not attached to machines,&amp;#8221; Holmes said as he presented a tongue-in-cheek look at  &amp;#8220;non-informational art&amp;#8221; &amp;#8212; fine art made by graphic journalists and artists when they weren&amp;#8217;t making infographics.&lt;/p&gt;

&lt;p&gt;As the conference was coming to a close, Holmes&amp;#8217; drawing attention to hands struck me as particularly insightful. One of the main undercurrents during the week was the tension between illustrative graphics, often borne out of photographs and sketches, and data visualization, whose raw material is often the very machine-based spreadsheet.&lt;/p&gt;

&lt;p&gt;The presentations on Day 2 almost seemed to alternate between these two styles of information graphics in turn. The day started off with Juan Colombato from La Voz del Interior in C&amp;#243;rdoba, Argentina, who explained his methods of creating graphics, starting with sketching, photographs, tracing Google Earth, research and reporting with experts.&lt;/p&gt;

&lt;p&gt;&lt;img src="http://propublica.s3.amazonaws.com/assets/malofiej21/colombato-google-earth-630.jpg" alt="Juan Colombato uses Google Earth as reference material for illustrative graphics"&gt;&lt;/p&gt;

&lt;p&gt;The themes in Colombato&amp;#8217;s presentation were echoed in those from Amanda Hobbs (the former Art Research Editor at National Geographic magazine) and Jen Christiansen of Scientific American.&lt;/p&gt;

&lt;p&gt;In contrast were the presentations of Jan Willem Tulp, and Fr&amp;#233;d&amp;#233;rik Ruys, who spoke of the process behind creating &lt;em&gt;data&lt;/em&gt;-based interactive and motion graphics.&lt;/p&gt;

&lt;p&gt;What struck me about all the Day 2 speakers was the emphasis on the data collection and analysis process, something you don&amp;#8217;t expect to hear much about at a conference dedicated to presentation. The Malofiej awards are judged on effectively conveying information graphically, not necessarily on how the information was gathered or analyzed. Nevertheless, process was a big part of the lectures, and it was fascinating to hear the different approaches taken by different publications. Both the illustration-graphics and data-visualization camps shared similar methods.&lt;/p&gt;

&lt;p&gt;Hobbs couldn&amp;#8217;t stress enough that the illustrations in National Geographic were not artistic renderings, but rather reported works based on extensive consultation with experts. Hundreds of emails and sketches (some on Post-Its) of proposed graphical bits were exchanged in pursuit of accuracy. Likewise, Colombato said that in order to cut through the jargon one usually hears from experts, he would often ask how they would draw something if they had to.&lt;/p&gt;

&lt;p&gt;&lt;img src="http://propublica.s3.amazonaws.com/assets/malofiej21/hobbs-verify-via-postit-630.jpg" alt="Hobbs exchanged notes with experts on illustrations, sometimes via Post-It"&gt;&lt;/p&gt;

&lt;p&gt;Ruys, a journalist and designer focused on motion graphics, and Tulp, a freelance interactive developer and designer, talked about using data as a primary source, and the struggles to clean the data as well as to find stories in it. When describing one of his sources, a database of shipping routes, Ruys showed an example of dirty data in the &amp;#8220;destination&amp;#8221; column. Certain captains would leave destinations blank, and others would even try to be funny, entering values like &amp;#8220;pirate bay,&amp;#8221; or &amp;#8220;wherever there is water.&amp;#8221; He ended up throwing out that data point because it was too messy. When looking at the shipping data for an animated sequence in a documentary called &amp;#8220;&lt;a href="http://nederlandvanboven.vpro.nl/afleveringen/overzicht.html"&gt;Netherlands from Above&lt;/a&gt;,&amp;#8221; he also spoke of finding stories in spreadsheets. In the end, he decided to focus on the journey of one ship rather than showing the whole set.&lt;/p&gt;

&lt;p&gt;Tulp analyzed Dutch elections data for a project called &lt;a href="http://tulpinteractive.com/close-votes/"&gt;Close Votes&lt;/a&gt; because he was curious whether certain cities in the Netherlands voted alike. Through his analysis and resulting data visualization, he discovered that the Dutch &amp;#8220;bible belt&amp;#8221; all voted similarly, and even discovered what may be the most religious city in the Netherlands based on its extreme voting patterns. While he could have done more to highlight the interesting findings (the graphic was, as we say, &lt;a href="https://github.com/propublica/guides/blob/master/design-structure.md"&gt;all near, no far&lt;/a&gt;), it was a good example of data visualization as generative storytelling &amp;#8212; graphics that actually generate stories themselves.&lt;/p&gt;

&lt;p&gt;The tension between illustration graphics and data visualizations also dovetails with another axis at Malofiej: representative and figurative work versus abstract and geometric presentation of findings. &lt;/p&gt;

&lt;p&gt;The best example of this split was a story Jen Christiansen told of a graphic she was developing on the expansion of the universe for Scientific American. Having been inspired by a 1998 minimalist scientific graphic by Bryan Christie, she set out to explain the phenomenon with a very elegant graphic of continually expanding circles (the observable universe) within a cone. After speaking with editors, she gradually came around to the idea of adding more representational detail, such as making the circles spherical and adding small illustrations of space phenomena. While the colors in the initial version were bright and primary, the &lt;a href="http://www.nature.com/scientificamerican/journal/v304/n4/box/scientificamerican0411-36_BX1.html"&gt;final graphic&lt;/a&gt; took on the traditional look of space graphics &amp;#8212; black background with spheres. After seeing the final product, I finally understood what the graphic was explaining. Christiansen called these trappings a &amp;#8220;welcoming gesture,&amp;#8221; and a &amp;#8220;glass of wine before a challenging plate of tripe.&amp;#8221; Easing readers into data through representation is something those on the pure data-visualization side can learn a lot from.&lt;/p&gt;

&lt;p&gt;&lt;img src="http://propublica.s3.amazonaws.com/assets/malofiej21/bryan-christie-98-630.jpg"&gt;&lt;/p&gt;

&lt;p&gt;&lt;img src="http://propublica.s3.amazonaws.com/assets/malofiej21/christiansen-graphic-before-630.jpg"&gt;&lt;/p&gt;

&lt;p&gt;&lt;img src="http://propublica.s3.amazonaws.com/assets/malofiej21/christiansen-graphic-iteration-630.jpg" alt=" Jen Christiansen was inspired by a minimalist 1998 scientific illustration by Bryan Christie, but when iterating on the design, realized that a more representative design was easier for readers to understand."&gt;&lt;/p&gt;

&lt;p&gt;These axes were a useful lens through which to understand where the Malofiej contest judges gave awards. The New York Times &amp;#8212; especially Archie Tse&amp;#8217;s work, which won a Gold medal for Individual Portfolio &amp;#8212; shows a mastery of both axes: illustration-based, representational explanations of how various Olympians win at their sports, alongside abstract (to the point of austere) shoreline-free flood maps for Hurricane Sandy coverage. His work is based on sketches and reporting, and it&amp;#8217;s based on massive data sets. The Malofiej judges looked for mastery along both axes.&lt;/p&gt;

&lt;p&gt;&lt;em&gt;&lt;strong&gt;Correction, March 18, 2013&lt;/strong&gt;: Jen Christiansen works for Scientific American, not American Scientist. The spheres in her graphic represent the expansion of the observable universe, not discrete planets.&lt;/em&gt;&lt;/p&gt;
&lt;p&gt;&lt;em&gt;&lt;strong&gt;Correction, March 19, 2013&lt;/strong&gt;: Amanda Hobbs was formerly the Art Research Editor, not an Art Director, at National Geographic magazine. The role of Post-Its in her work has been clarified.&lt;/em&gt;&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/zL2iuUzE1Yg" height="1" width="1"/&gt;</description>
		<dc:author>Al Shaw</dc:author>
		<dc:subject />
		<dc:date>2013-03-18T10:32:54-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/between-human-and-machine-thoughts-on-malofiej-21-day-2/</feedburner:origLink></item>

	<item>
		<title>Outsider CAR: Quick Thoughts on Malofiej 21 Day 1</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/xYHPdiVqW3w/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/outsider-car-quick-thoughts-on-malofiej-21-day-1/#25603</guid>
		<description>&lt;p&gt;&lt;em&gt;This is the first of two posts about the 21st Malofiej World Infographic Summit. &lt;a href="http://www.propublica.org/nerds/item/between-human-and-machine-thoughts-on-malofiej-21-day-2"&gt;Read the second here&lt;/a&gt;.&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;Today was the first day of the &lt;a href="http://www.malofiej21.com/malofiej-world-summit-program/"&gt;Malofiej World Infographic Summit&lt;/a&gt; in Pamplona, Spain. It&amp;#8217;s a very small (the whole attendee list fits on two double sided sheets of paper) two day one-track conference. The Malofiej awards are highly geared towards news graphics (mostly print, but more so on the web), so naturally I thought the conference talks would be, too. But there was a broad spectrum of speakers. The big guys were there (Wilson Andrews of the Washington Post and Graham Roberts of The New York Times), and art directors from magazines in Brazil and Russia, but also a few, for lack of a better term, data artists. &lt;/p&gt;

&lt;p&gt;Two talks, in particular, stuck out to me as examples of what I can only describe as &amp;#8220;outsider CAR.&amp;#8221; Only after I &lt;a href="https://twitter.com/A_L/status/312142828266475520"&gt;tweeted&lt;/a&gt; this, did I realize the coinage.&lt;/p&gt;

&lt;p&gt;But, here&amp;#8217;s what I mean. Nicholas Felton &amp;#8212; creator of the infamous &lt;a href="http://feltron.com/ar12_01.html"&gt;Feltron Reports&lt;/a&gt; &amp;#8212; discussed both his methodology and production strategies for the report in general, but specifically focussing on the 2012 report. What struck me about his presentation was that he was using the vocabulary of Computer Assisted Reporting, but was using data collection, analysis and presentation methods he had come up with himself. Obviously, the Feltron Report isn&amp;#8217;t a work of journalism. But it&amp;#8217;s close. It actually could be, if he took the data one step further and found and presented stories in it. But everything from &amp;#8220;sampling&amp;#8221; (filling out surveys in a custom iPhone app that pings him randomly) to the final product (often using stuff like multi-slice pie charts, force directed diagrams, and &amp;#8220;ego-centric&amp;#8221; cartography wherein maps remove all points of places he hadn&amp;#8217;t been) are things that would never be allowed in a news environment. Felton said that 2012 was the first year he started &amp;#8220;sketching in code&amp;#8221; using Processing. In previous years, he would start out in Illustrator to create, say, a pie chart, and then circle back and fill in the data later. I&amp;#8217;ll be curious to see whether next year, he veers more towards journalistic methods, say, bringing the data into R and looking for trends and correlations that could then be &amp;#8220;reported&amp;#8221; on. Or, maybe doing a third party edition to get a dispassionate report, rather than relying on &amp;#8220;sampling&amp;#8221; that is probably swayed by his own view of what will turn up in the product.&lt;/p&gt;

&lt;p&gt;Another talk, by &lt;a href="http://www.itsbeenreal.co.uk/"&gt;Stefanie Posavec&lt;/a&gt;, a freelance &amp;#8220;data-related designer&amp;#8221; (her term) focused on a bunch of work, most notably her gorgeous diffs of &lt;a href="http://www.itsbeenreal.co.uk/index.php?/on-going/about/"&gt;Darwin&amp;#8217;s Origin of Species editions&lt;/a&gt;. Late in the talk, she discussed a new project she&amp;#8217;s working on for the V&amp;amp;A Museum, and in particular, a map she made of triangles between country capitals. She made the map by tracing Google Earth and bringing it into Illustrator. While, obviously, this also isn&amp;#8217;t journalism, I think it&amp;#8217;s another example of &amp;#8220;outsider CAR.&amp;#8221; In a news environment, she would be invariably steered towards GIS software, but instead she came to her own cartographic methodology. When an audience member asked why she didn&amp;#8217;t learn to code, she joked about her own laziness but also said it&amp;#8217;s kind of like the difference between a hand-knitted and machine-knitted sweater. As a purveyor of bespoke artisanal data, I object a bit to that, but I wonder what the piece would have looked like if she had, say, generated the triangulated map in PostGIS. It would have been easier, but it would have probably come out a lot differently. People have quite different conceptions of laziness!&lt;/p&gt;

&lt;p&gt;I&amp;#8217;m interested to see what tomorrow&amp;#8217;s talks bring. It&amp;#8217;s an interesting conference, and I think those of us who are often directed towards the &lt;em&gt;right&lt;/em&gt; way to do visualizations, cartography, data collection, analysis and cleaning in our newsrooms and at conferences like NICAR, can learn from outsiders who are hand crafting data according to their own rules.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/xYHPdiVqW3w" height="1" width="1"/&gt;</description>
		<dc:author>Al Shaw</dc:author>
		<dc:subject />
		<dc:date>2013-03-14T20:28:01-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/outsider-car-quick-thoughts-on-malofiej-21-day-1/</feedburner:origLink></item>

	<item>
		<title>Casey Thomas, P5 Resident</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/DQWzc_y43HY/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/casey-thomas-p5-resident/#25602</guid>
		<description>&lt;p&gt;The fourth P5 Resident started a project in the ProPublica offices today. He&amp;#8217;s Casey Thomas from &lt;a href="http://www.axisphilly.org" title="http://www.axisphilly.org"&gt;AxisPhilly&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;AxisPhilly is a non-profit news startup in Philadelphia. Their mission is to &amp;#8220;educate and engage citizens on topics of public interest while empowering them with tools to participate in developing and implementing change.&amp;#8221; A big part of their mission is interactive news applications. In a newsroom of nine people, AxisPhilly has two news apps developers and a freelancer who works on data projects. Casey&amp;#8217;s projects at AxisPhilly include a map of the &lt;a href="http://apps.axisphilly.org/deadbeat-neighbors/" title="http://apps.axisphilly.org/deadbeat-neighbors/"&gt;effects of delinquent properties&lt;/a&gt; on the market value of nearby homes, as well as a map projecting the &lt;a href="http://apps.axisphilly.org/avi-map/" title="http://apps.axisphilly.org/avi-map/"&gt;property tax changes&lt;/a&gt; at each address in the city.&lt;/p&gt;

&lt;p&gt;Casey&amp;#8217;s project at ProPublica is about the 23 schools slated to close in Philadelphia at the end of the school year. More details when it launches very soon!&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/DQWzc_y43HY" height="1" width="1"/&gt;</description>
		<dc:author>Scott Klein</dc:author>
		<dc:subject />
		<dc:date>2013-03-14T17:47:00-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/casey-thomas-p5-resident/</feedburner:origLink></item>

	<item>
		<title>Everything You’ve Ever Wanted to Know About Our News App Tech</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/jE29WUXVR8I/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/our-news-app-tech/#25550</guid>
		<description>&lt;p&gt;As newsrooms incorporate news application teams, one of the first questions they have to answer for themselves is what technologies to choose, and how to set up developer and web hosting environments that are sane and tuned to serve news apps. As part of her &lt;a href="http://www.propublica.org/about/p5-program"&gt;P5 Residency&lt;/a&gt;, developer Peggy Bustamante from Digital First Media's &lt;a href="http://www.insidethunderdome.com/"&gt;Project Thunderdome&lt;/a&gt; spent a few days mapping ProPublica's infrastructure. She's &lt;a href="http://www.propublica.org/nerds/item/other-ways-to-serve-an-app"&gt;written a post&lt;/a&gt; that lays out some alternatives that came out of a discussion on the NICAR-L mailing list, and the answers that the Thunderdome team came up with.&lt;/p&gt;

&lt;p&gt;What follows is ProPublica's advice on developer and server setups for news apps teams.&lt;/p&gt;

&lt;h2&gt;
&lt;a name="step-one-pick-a-server-side-programming-language-and-stick-to-it" class="anchor" href="#step-one-pick-a-server-side-programming-language-and-stick-to-it"&gt;&lt;span class="mini-icon mini-icon-link"&gt;&lt;/span&gt;&lt;/a&gt;Step One: Pick a Server-Side Programming Language and Stick to It.&lt;/h2&gt;

&lt;p&gt;One of the early decisions you should make is which programming language and which development framework to use. Two prominent options are &lt;a href="http://rubyonrails.org/"&gt;Ruby on Rails&lt;/a&gt; and &lt;a href="https://www.djangoproject.com/"&gt;Python/Django&lt;/a&gt;. There are other decent options like &lt;a href="http://laravel.com/"&gt;PHP/Laravel&lt;/a&gt; and even the &lt;a href="http://www.asp.net/mvc"&gt;Microsoft stack&lt;/a&gt; but if you want to share server-side code with other newsroom nerds (and you do) you're best off choosing between Ruby and Python. &lt;/p&gt;

&lt;p&gt;The news apps world is split somewhere down the middle between Rails developers and Django developers, with the Rails developers seemingly concentrated on the island of Manhattan. Both frameworks are really great, and they both inspire a lot of pride and trash talking among their adherents. Which one to pick is predominantly a personal preference. If you and your developers like coding in Ruby, use Rails. If they like coding in Python, use Django. Neither is an objectively correct choice. There are newsrooms who have built apps in .NET and in PHP, too. Within certain bounds of sanity, my rule is that if you're going to be writing in a language all day every day for years, you might as well pick one you enjoy, and for which there's a decent chance you'll be able to hire experienced developers. &lt;/p&gt;

&lt;p&gt;An important caveat: The one thing you shouldn't do is pick a language solely because of corporate standards or because "&lt;a href="http://hackshackers.com/blog/2010/04/13/dont-mistake-your-cms-for-a-development-platform/"&gt;it's what the CMS is written in&lt;/a&gt;." News apps are a new kind of development for newsrooms, and they require a new, more nimble way of thinking. The benefits of corporate standardization are real, but they shouldn't dictate the tools developer/journalists use any more than business-side standards dictate how we report the news.&lt;/p&gt;

&lt;p&gt;Once you've picked the language and framework, do your best to stick to them. Trying out new languages and approaches for each project increases your &lt;a href="http://en.wikipedia.org/wiki/Technical_debt"&gt;technical debt&lt;/a&gt; and complicates your life needlessly, and makes hiring more difficult.&lt;/p&gt;

&lt;h2&gt;
&lt;a name="step-two-assemble-a-team" class="anchor" href="#step-two-assemble-a-team"&gt;&lt;span class="mini-icon mini-icon-link"&gt;&lt;/span&gt;&lt;/a&gt;Step Two: Assemble a Team&lt;/h2&gt;

&lt;p&gt;The news app cube at ProPublica is currently eight people, but don't think you can't get started until you've got a budget that big. We started out much smaller (at first just me, then three of us for a long while), and many small teams around our industry make incredible news apps. &lt;/p&gt;

&lt;p&gt;I often hear when I talk to newsrooms trying to assemble teams that there is no good, affordable dev talent in their area. My response is always the same: They're looking in the wrong places. Here's why:&lt;/p&gt;

&lt;p&gt;The talents needed to be a great news app developer form a three-legged stool. The first is the ability to do journalism -- The imperative to get the facts right, "editorial judgment," etc. The second leg is enough design acumen to make an interactive presentation that people understand and that tells a coherent "story." The third leg is the ability to write code quickly. I would argue that this third leg is the most teachable of the three. The fact is, the aforementioned development frameworks do a lot of the real heavy lifting, code-wise; the kinds of apps we create -- where the data is complex but the interactions quite simple -- tend not to need truly high-performance dynamic code, and server caches like &lt;a href="https://www.varnish-cache.org/"&gt;Varnish&lt;/a&gt; make even slightly inefficient code fast enough to handle enormous traffic.&lt;/p&gt;

&lt;p&gt;So while you may think that you need a developer with a CS degree, and you may think you're competing with Google for talent, in fact the first place you should be looking for talent is your own newsroom. Most newsrooms I know have graphics editors and designers who have been secretly writing website scrapers and using graphics frameworks like &lt;a href="http://d3js.org/"&gt;D3&lt;/a&gt; for years. Graphics people have the journalism chops and the design acumen to make outstanding news apps and often just need to be given the space and time, and maybe &lt;a href="https://peepcode.com/"&gt;a little training&lt;/a&gt;, to step into the role of news app developer. Then, as time goes on, you'll find that their success grows better than linearly as you add resources. &lt;/p&gt;

&lt;p&gt;It's also very important that the people on your news app teams think of themselves as journalists. They should file FOIAs and make reporting calls. If they're working with a traditional reporter they should read drafts and see outlines and story memos right from the beginning. Most importantly they should report to an editor and not the IT department. IT plays an important role in your newsroom but news application developers are much more like reporters than they are like corporate devs. They should be managed the right way.&lt;/p&gt;

&lt;h2&gt;
&lt;a name="step-three-outfit-your-team" class="anchor" href="#step-three-outfit-your-team"&gt;&lt;span class="mini-icon mini-icon-link"&gt;&lt;/span&gt;&lt;/a&gt;Step Three: Outfit Your Team&lt;/h2&gt;

&lt;p&gt;At ProPublica, we're all on Macs, because the Mac OS's unix-like underpinnings make it a pretty natural environment to develop code for deployment on Linux servers, and its popularity makes it easy to integrate it into corporate email systems, to buy Photoshop for it, etc. We know other developers who happily use Linux on their dev workstations. It wouldn't surprise me to hear devs out there using Windows, though I'll leave as an exercise for the reader how to set up Windows for web development.&lt;/p&gt;

&lt;p&gt;We each have two monitors: One is a nice bright, sharp iMac screen which is useful when you're staring at a screen for 8-10 hours a day. The second is one we call a CDM, a low-end "cheap Dell monitor" which is useful to look at pages as we're building them. It's a worst-case scenario end-user setup. You'll find that some designs that look great on a beautiful Apple screen will seem washed out and almost-invisible on the CDM.&lt;/p&gt;

&lt;p&gt;We do browser testing using several virtual machines, one for each supported Internet Explorer. We support IE8 and up, though you should decide what browsers to support for yourselves after taking a good look at your analytics. All the virtual machines run on a server in our server closet. Until recently we used the free version of VMWare Server and existing Windows licenses, so the setup costs were pretty minimal. We connect to the VMs using Microsoft's Remote Desktop Connector rather than running virtualization software on our computers. Another option is to use the Microsoft-provided virtual machines, which can be installed free of charge with &lt;a href="https://github.com/xdissent/ievms"&gt;ievms&lt;/a&gt;.&lt;/p&gt;

&lt;h2&gt;
&lt;a name="step-four-organize-your-team" class="anchor" href="#step-four-organize-your-team"&gt;&lt;span class="mini-icon mini-icon-link"&gt;&lt;/span&gt;&lt;/a&gt;Step Four: Organize Your Team&lt;/h2&gt;

&lt;p&gt;While we frequently collaborate on news apps, we don't take the approach that the entire team works on one project at a time. One of us takes the lead on each project (you can tell who was the lead on a given project by looking at the bylines on the app: the lead developer is typically the first). We find that having one person who can take the time to read drafts, collaborate with a reporter, do any necessary research and reporting, and in general have the responsibility and vision for an app is really useful. Big scrums, while often very useful, can end up breaking up projects into such tiny chunks that journalistic opportunities are lost. Somebody doing data gathering and analysis should be thinking of story possibilities, and the designer should be very well steeped in the vagaries of the data, and the backend and front-end development work often flow together so much that a single developer should do them both anyway.&lt;/p&gt;

&lt;p&gt;Naturally, not everybody is a true polymath and can do design, back-end code and journalism equally well. But each of us has strengths and we help each other. The stronger designers help the stronger coders, who help the stronger reporters.&lt;/p&gt;

&lt;p&gt;ProPublica puts all of its code into source code management, which is an absolute requirement for development teams (and even solo developers). We use git. There are others, but the competition is essentially over and git won.&lt;/p&gt;

&lt;p&gt;Using git enables us to work together on projects without clobbering each other&amp;#8217;s work, to keep track of our changes and to easily roll back to previous versions when we need to.&lt;/p&gt;

&lt;p&gt;While we're developing apps, we run them locally using Rails' built-in server. That lets us test and experiment with things outside production so there&amp;#8217;s no risk of bugs or nonpublic material appearing on the Internet. We pass URLs to each other and to the wider newsroom using special local DNS records, so sending a test URL to a colleague is as easy as sending any other kind of URL, though of course they can only connect to our local apps while inside our offices.&lt;/p&gt;

&lt;p&gt;Rails apps are pretty self-contained so this setup is pretty easy to manage. Some other teams use virtual machines to ensure a closer match between developer workstation and production server, but our system has suited us fine so far.&lt;/p&gt;

&lt;p&gt;In our news applications, the data is often as finely honed as an artisanal cheese, so we tend to check even our data sets into version control so that we can version them and make deploys easier. &lt;/p&gt;

&lt;p&gt;When a ProPublica news app is ready to deploy, we use &lt;a href="http://capistranorb.com/"&gt;Capistrano&lt;/a&gt; to send our apps up to a production server. Capistrano is a ruby-based system (the Pythonic equivalent is called &lt;a href="http://docs.fabfile.org/en/1.5/"&gt;Fabric&lt;/a&gt;). It lets you specify exactly how your app should be deployed -- where on the server your apps are located, any web server config changes that need to be made to accommodate the code, and any special commands that need to be executed at any point in the deployment -- like cache invalidation, database re-seeding, restarting the app, etc.&lt;/p&gt;

&lt;p&gt;Our deployment recipe is pretty specific to us, but Capistrano makes it easy to automate pretty much everything about your deployment strategy. It even backs up previous versions of your apps so it's dead simple to roll back in case something goes wrong with a deploy. If you want to enable less-technical people to execute some aspects of your deployment system you can explore &lt;a href="https://github.com/peritor/webistrano"&gt;Webistrano&lt;/a&gt;, which is sort of a web-enabled version of Capistrano. &lt;/p&gt;

&lt;h2&gt;
&lt;a name="step-five-set-up-your-server-environment" class="anchor" href="#step-five-set-up-your-server-environment"&gt;&lt;span class="mini-icon mini-icon-link"&gt;&lt;/span&gt;&lt;/a&gt;Step Five: Set Up Your Server Environment&lt;/h2&gt;

&lt;p&gt;One of the most important choices you'll make is your server environment. I'm going to talk about Linux and as you're still reading I'm going to assume you've already decided on using Linux. &lt;/p&gt;

&lt;p&gt;There are a bunch of options for web hosting but I'll talk about two: Real-server hosting and the cloud.&lt;/p&gt;

&lt;p&gt;With real servers, you may have the advantage of existing IT infrastructure. You may already have a server room and tons of bandwidth, and adding a few web servers would only be an incremental cost. But adding servers is a slow enough process that scaling out to meet temporary demand (say, adding ten servers on election day that you don't need anymore the day after) can attenuate the benefits of self-hosting. Also, in my experience IT departments want a say over what gets installed on servers they're expected to maintain, and depending on your IT department that might introduce cross-departmental management problems.&lt;/p&gt;

&lt;p&gt;Cloud hosting gives you incredible flexibility when you need on-demand server instances, quick scale-outs, rapid OS migrations, etc. Cloud servers are not as fast as real servers, and the costs can scale quickly relative to real servers. A low-end cloud server can sometimes cost only a few dollars or less a day (you pay by the hour), but cloud servers can get expensive on the high end. Cloud is best when flexibility is more important than raw performance. In terms of security, cloud and real servers are a tie.&lt;/p&gt;

&lt;p&gt;ProPublica&amp;#8217;s production server environment uses &lt;a href="http://aws.amazon.com"&gt;Amazon Web Services&lt;/a&gt;. We use a fairly plain vanilla three-tier architecture with a cache server in front of a few application servers in front of database servers. Some of our databases are hosted using Amazon's face-meltingly cool &lt;a href="http://aws.amazon.com/rds/"&gt;Relational Database Service&lt;/a&gt;. RDS runs just like a MySQL for us. It can also be a drop-in replacement for Oracle and SQL Server, though we don't need those. RDS does not support PostgreSQL, so for apps that require it we run it on a real EC2 instance.&lt;/p&gt;

&lt;p&gt;For cache we run Varnish, which is a very high performing write-through cache. Varnish makes it so that each unique request only hits our backend servers once. It's so insanely fast that we once had an app get 2 million page views in a few hours and our CPU load average barely budged off of zero. I'm pretty sure Varnish contains alien technology. &lt;/p&gt;

&lt;p&gt;Behind our Varnish box are several app servers that run the news apps projects. They are not directly accessible over the Internet.&lt;/p&gt;

&lt;p&gt;Incidentally, ProPublica's systems started much more simply. We started out with just two servers -- even from the beginning you'll want to keep your database on a separate server. For a long time we only had two Amazon servers, and we served a lot of traffic with just that. Our current setup can scale out incredibly easily by adding more app servers.&lt;/p&gt;

&lt;h2&gt;
&lt;a name="alternatives" class="anchor" href="#alternatives"&gt;&lt;span class="mini-icon mini-icon-link"&gt;&lt;/span&gt;&lt;/a&gt;Alternatives&lt;/h2&gt;

&lt;p&gt;&lt;a href="http://www.propublica.org/nerds/item/other-ways-to-serve-an-app"&gt;Peggy's post&lt;/a&gt; talks about alternatives to doing things this way, and there are lots. In addition to using real servers instead of the cloud, some teams "bake out" their work using dedicated computers and then upload the baked-out static files to a service like &lt;a href="http://aws.amazon.com/s3"&gt;Amazon's S3&lt;/a&gt;. Static files can't crash and they can handle &lt;a href="http://blog.apps.npr.org/2013/02/14/app-template-redux.html"&gt;insane amounts of traffic&lt;/a&gt;. But there are big tradeoffs. If you need to store user input, or if you have lots and lots of possible application end points, or if you want to let users search for arbitrary terms, baking out might not be the best option.&lt;/p&gt;

&lt;p&gt;There's lots more to it but this should get you started. If you have questions, the best place to ask them is the &lt;a href="http://www.ire.org/resource-center/listservs/subscribe-nicar-l/"&gt;NICAR-L email list&lt;/a&gt;. News app developers tend to lurk on that list.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/jE29WUXVR8I" height="1" width="1"/&gt;</description>
		<dc:author>Scott Klein</dc:author>
		<dc:subject />
		<dc:date>2013-02-27T11:14:54-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/our-news-app-tech/</feedburner:origLink></item>

	<item>
		<title>P5 Resident Researches Other Ways to Serve an App</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/ptpWaUjgnIw/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/other-ways-to-serve-an-app/#25433</guid>
		<description>&lt;p&gt;&lt;em&gt;Peggy Bustamante is a news app developer with &lt;a href="http://www.digitalfirstmedia.com/"&gt;Digital First Media&amp;#8217;s&lt;/a&gt; Data Team, and was the &lt;a href="http://www.propublica.org/about/p5-project"&gt;P5 Resident&lt;/a&gt; at ProPublica in January. She spent her time working with ProPublica News App Editor Scott Klein on mapping ProPublica's tech setup. Scott wrote a &lt;a href="http://www.propublica.org/nerds/item/our-news-app-tech"&gt;blog post about ProPublica's setup&lt;/a&gt;, and Peggy wrote this post, about alternative scenarios and DFM's own approach.&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;The first thing you want to do when you join a news apps team is build cool projects.&lt;/p&gt;

&lt;p&gt;But if you are a new outfit, as we are at Digital First Media&amp;#8217;s &lt;a href="http://www.insidethunderdome.com/"&gt;Thunderdome&lt;/a&gt; data team, there is one big step before you can get to that happy place of creativity and news: you have to set up an environment to build those cool projects.&lt;/p&gt;

&lt;p&gt;My visit to ProPublica as a P5 fellow last month coincided with the crucial moment of our team&amp;#8217;s deciding how that development environment would be configured. On my first day, Scott Klein and I discussed ProPublica&amp;#8217;s setup and what some other news orgs had chosen. It spawned a rather hearty discussion on &lt;a href="http://www.ire.org/resource-center/listservs/subscribe-nicar-l/"&gt;NICAR-L&lt;/a&gt;. When the dust settled, we found some commonality among news devs and a variety of viable options.&lt;/p&gt;

&lt;p&gt;Almost without exception, news apps teams are on cloud servers, not confined to building projects within the constraints of their news organization&amp;#8217;s CMS. All agree, their work could not be done if they didn&amp;#8217;t have a separate development environment.&lt;/p&gt;

&lt;p&gt;While news app teams use a wide variety of languages, most use either Python/Django or Ruby on Rails. ProPublica and the New York Times use Rails. NPR and L.A. Times use Python/Django. DFM&amp;#8217;s Data Team is starting with PHP because our developers have extensive experience building projects with it. In the coming months, we will be transitioning into Python/Django. We also use JavaScript and JSON feeds to build a variety of data-driven projects, such as our &lt;a href="http://extras.twincities.com/elections/predictor4x/index.html"&gt;NFL playoff predictor&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;ProPublica&amp;#8217;s server setup, where the discussion began, is what I imagine is fairly standard, one that we as a team at DFM had been considering. It consists of a Varnish server in front of two development servers that are connected to database servers. ProPublica uses Amazon Web Services and Amazon&amp;#8217;s Relational Database Service. They also have a second server for PostgreSQL because RDS does not support PostgreSQL. Lots more details on ProPublica's setup are over at &lt;a href="http://www.propublica.org/nerds/item/our-news-app-tech"&gt;Scott's post&lt;/a&gt;.&lt;/p&gt;

&lt;h2&gt;
&lt;a name="the-heroku-option" class="anchor" href="#the-heroku-option"&gt;&lt;span class="mini-icon mini-icon-link"&gt;&lt;/span&gt;&lt;/a&gt;The Heroku Option&lt;/h2&gt;

&lt;p&gt;In the NICAR-L discussion, some recommended &lt;a href="http://www.heroku.com"&gt;Heroku&lt;/a&gt;, especially Chase Davis, formerly of the Center for Investigative Reporting, and Ryan McNeil at Thomson Reuters. Chase Davis' &lt;a href="http://cironline.org/blog/post/heroku-news-apps-3415"&gt;blog post on Heroku&lt;/a&gt; is quite useful. Although originally only for Ruby on Rails, Heroku added Python support in late 2011.&lt;/p&gt;

&lt;p&gt;One big advantage of Heroku is that there is no server configuration, as there is with AWS, and it&amp;#8217;s easy to increase and decrease server capacity depending on a news app&amp;#8217;s traffic.&lt;/p&gt;

&lt;p&gt;On the downside, although deploying news apps is greatly simplified, Heroku can get expensive fairly quickly. It was generally agreed that Heroku is a great solution for smaller apps, but not for large-scale apps. A large part of the expense comes from using Heroku&amp;#8217;s databases. CIR avoided that pitfall by using Heroku servers and putting databases on their Amazon EC2 box.&lt;/p&gt;

&lt;p&gt;Heroku's costs can also be kept down by keeping the amount of data a news app uses to a minimum and putting in a little more effort with caching and configuration. That would allow an app with modest traffic to fit within the free account for a while.&lt;/p&gt;

&lt;p&gt;But for news apps teams shooting for more complex apps with larger databases, another solution is...&lt;/p&gt;

&lt;h2&gt;
&lt;a name="cooking-up-apps-by-baking" class="anchor" href="#cooking-up-apps-by-baking"&gt;&lt;span class="mini-icon mini-icon-link"&gt;&lt;/span&gt;&lt;/a&gt;Cooking Up Apps By &amp;#8220;Baking&amp;#8221;&lt;/h2&gt;

&lt;p&gt;The &amp;#8220;baking&amp;#8221; option, which is gaining ground, involves outputting or &amp;#8220;baking&amp;#8221; all the possible pages of a project to flat HTML files and then serving them from Amazon S3. The L.A. Times and NPR use this technique extensively. The advantages of this approach are that it is extremely inexpensive, and spikes in traffic aren&amp;#8217;t a problem. And because there is no user input, security is much less of an issue.&lt;/p&gt;

&lt;p&gt;Scott Klein, however, points out some of the disadvantages of &amp;#8220;baking&amp;#8221;:&lt;/p&gt;

&lt;blockquote&gt; &lt;em&gt;   Just to play devil's advocate, if your app doesn't have search or store user input, the whole thing will be in Varnish's cache (or your favorite framework's page cache) almost immediately anyway, so the resilience and performance benefits of baking out might be less than you think. And if you've got an app with lots of possible end points (say, millions of doctor payments or a national slippy map with 16 levels of zoom) the complexity tradeoff in baking out pages goes under water pretty fast. When you find an error in your map you don't want to wait three hours while your bake-out finishes to fix it.&lt;/em&gt;
&lt;/blockquote&gt;

&lt;p&gt;The current rule seems to be whenever you can, bake it flat. Up to 80 percent of the projects at NPR are served this way.&lt;/p&gt;

&lt;h2&gt;
&lt;a name="what-dfms-data-team-ended-up-with" class="anchor" href="#what-dfms-data-team-ended-up-with"&gt;&lt;span class="mini-icon mini-icon-link"&gt;&lt;/span&gt;&lt;/a&gt;What DFM&amp;#8217;s Data Team Ended Up With&lt;/h2&gt;

&lt;p&gt;After much discussion and consideration, the DFM Data Team settled on our own version of all of the above.&lt;/p&gt;

&lt;p&gt;As with other news apps teams, our development environment is separate from the CMS. We are also in &amp;#8220;The Cloud,&amp;#8221; albeit an internal cloud here at Digital First Media, which makes sense for us because the company has dozens of news entities that we serve.&lt;/p&gt;

&lt;p&gt;Our configuration has load balancers in front of the database and production servers, which are fed by an identical staging server for testing and sharing works in progress, and a separate development server.&lt;/p&gt;

&lt;p&gt;We have also decided to &amp;#8220;bake&amp;#8221; projects whenever possible, which will be even easier as we move to Python, so we can explore Ben Welsh&amp;#8217;s delicious &lt;a href="http://datadesk.latimes.com/posts/2012/03/introducing-django-bakery/"&gt;Django Bakery&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;And now we can build cool projects.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/ptpWaUjgnIw" height="1" width="1"/&gt;</description>
		<dc:author>ProPublica</dc:author>
		<dc:subject />
		<dc:date>2013-02-27T11:13:59-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/other-ways-to-serve-an-app/</feedburner:origLink></item>

	<item>
		<title>ProPublica’s News Apps and Data Guides</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/okUw8Shyh_g/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/propublicas-news-app-guides/#25553</guid>
		<description>&lt;p&gt;Today we're publishing a series of guides that we hope will be useful for news app teams everywhere: A &lt;a href="https://github.com/propublica/guides/blob/master/news-apps.md"&gt;News App style guide&lt;/a&gt;, a high-level &lt;a href="https://github.com/propublica/guides/blob/master/design-structure.md"&gt;design overview&lt;/a&gt;, a &lt;a href="https://github.com/propublica/guides/blob/master/coding-manifesto.md"&gt;coding manifesto&lt;/a&gt;, our &lt;a href="https://github.com/propublica/guides/blob/master/social-tags.html"&gt;standard social tags&lt;/a&gt; and a &lt;a href="https://github.com/propublica/guides/blob/master/data-bulletproofing.md"&gt;data bulletproofing guide&lt;/a&gt;. They represent what we've learned and our best advice for designing consistent, social-optimized and impactful apps in a sane dev environment.&lt;/p&gt;

&lt;p&gt;&lt;a href="http://www.github.com/propublica/guides/"&gt;See all the guides here&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The guides were written by my colleagues &lt;a href="http://www.propublica.org/site/author/jeff_larson"&gt;Jeff Larson&lt;/a&gt; and &lt;a href="http://www.propublica.org/site/author/jennifer_lafleur"&gt;Jennifer LaFleur&lt;/a&gt; and me.&lt;/p&gt;

&lt;p&gt;We're not publishing the guides on this site. They'll live on GitHub, a site that hosts open source code repositories. On GitHub, other newsrooms are welcome to fork our guides for their own use and to recommend changes to us, either using GitHub's "issues" feature or by forking, making a change in their own fork, and submitting the changes back to us as a "pull request."&lt;/p&gt;

&lt;p&gt;We mean the guides to be living documents. We'll be adding new things as we learn them, and we encourage everybody to submit what they've learned as well. Our opinions are strong but weakly held, and when we change our mind, you'll see changes in the documents.&lt;/p&gt;

&lt;p&gt;Please let us know if you find the guides useful, and make sure to submit ideas via GitHub!&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/okUw8Shyh_g" height="1" width="1"/&gt;</description>
		<dc:author>Scott Klein</dc:author>
		<dc:subject />
		<dc:date>2013-02-27T11:06:04-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/propublicas-news-app-guides/</feedburner:origLink></item>

	<item>
		<title>RIP EveryBlock</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/D742AVS1v-U/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/rip-everyblock/#25482</guid>
		<description>&lt;style&gt;
.content-left, .sidebar-inject {display:none !important;}
&lt;/style&gt;


&lt;p&gt;With the closing of &lt;a href="http://www.holovaty.com/writing/rip-everyblock/"&gt;EveryBlock&lt;/a&gt; yesterday, both of the main reasons I started working in News Applications have disappeared from the internet. The other reason was a New York Times app called &lt;a href="http://projects.nytimes.com/represent/"&gt;Represent&lt;/a&gt;, which allowed you to keep an eye on your elected representatives. The world is worse off with these pioneering news apps gone, but retiring old apps is something that our industry has to come to terms with. At ProPublica, we&amp;#8217;ve started to retire our old apps, by removing search boxes and dynamic calls to the server. We&amp;#8217;re making every effort to make them available on the internet forever. The process is a simple one:&lt;/p&gt;

&lt;pre style="overflow-x:scroll"&gt;&lt;code&gt;wget -mkr -nH --no-parent -p --convert-links --content-disposition --adjust-extension http://url_to_app/
&lt;/code&gt;&lt;/pre&gt;

&lt;p&gt;EveryBlock and Represent opened up doors for me: Represent got me started with GIS. The &lt;a href="http://open.blogs.nytimes.com/2008/12/22/represent/"&gt;blog post&lt;/a&gt; about Represent&amp;#8217;s stack is where I learned about &lt;a href="https://docs.djangoproject.com/en/dev/ref/contrib/gis/"&gt;GeoDjango&lt;/a&gt; and &lt;a href="http://www.gdal.org/ogr2ogr.html"&gt;ogr2ogr&lt;/a&gt;, a tool I use every day. I even made my own geocoder following their lead.&lt;/p&gt;

&lt;p&gt;EveryBlock inspired me to write &lt;a href="https://github.com/propublica/simple-tiles"&gt;SimpleTiles&lt;/a&gt; after reading this &lt;a href="http://alistapart.com/article/takecontrolofyourmaps"&gt;blog post&lt;/a&gt; a million times, and I still use the bar charts outlined in this &lt;a href="http://alistapart.com/article/accessibledatavisualization"&gt;article&lt;/a&gt; for almost everything.&lt;/p&gt;

&lt;p&gt;It is a shame that they are gone from the internet, and I hope we can replace them post haste.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/D742AVS1v-U" height="1" width="1"/&gt;</description>
		<dc:author>Jeff Larson</dc:author>
		<dc:subject />
		<dc:date>2013-02-08T15:47:27-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/rip-everyblock/</feedburner:origLink></item>

	<item>
		<title>A New Way to ‘Check In’ on Education Inequality</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/1jttNDg6yVw/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/a-new-way-to-check-in-on-education-inequality/#25402</guid>
		<description>&lt;p&gt;Starting today if you &lt;a href="https://projects.propublica.org/schools/foursquare"&gt;connect&lt;/a&gt; your Foursquare account to "The Opportunity Gap," we'll send you stats about schools whenever you check into one. If you've checked into a school we've associated with a Foursquare "venue," we'll show you some details and give you a link to that school's profile.&lt;/p&gt;

&lt;p&gt;You can also tap the ProPublica section of your checkin (see screenshot to right) to bring up that school's profile and compare it to nearby schools right from your smartphone.&lt;/p&gt;

&lt;img src="http://propublica.s3.amazonaws.com/projects/schools/foursquare-schools-checkin.png" style="float:right;margin-left:5px;margin-bottom:5px;"&gt;

&lt;p&gt;A year ago when we launched the first version of our "&lt;a href="http://projects.propublica.org/schools"&gt;Opportunity Gap&lt;/a&gt;" news application, we &lt;a href="http://www.propublica.org/nerds/item/facebook-for-news-apps-how-we-harnessed-the-social-network"&gt;tightly integrated Facebook&lt;/a&gt; in order to make it easy for readers to compare schools and share their school comparisons. Today's relaunch adds Foursquare, along with adding a &lt;a href="http://propublica.org/article/new-data-analysis-at-some-schools-achievement-lags-behind-opportunity"&gt;slew of new data&lt;/a&gt; to the app as well as algorithmically &lt;a href="http://propublica.org/nerds/item/how-to-edit-52000-stories-at-once"&gt;generated narratives by Narrative Science.&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;To accomplish the Foursquare integration, we're taking advantage of their new &lt;a href="https://developer.foursquare.com/overview/realtime"&gt;Real-Time API&lt;/a&gt; which lets us send push notifications in response to checkins. In order to associate schools with Foursquare venues, we used Foursquare's search API with its "match" intent -- a &lt;a href="https://developer.foursquare.com/overview/mapping"&gt;specially-designed endpoint&lt;/a&gt; for "venue harmonization" between apps. We ran our database of over 50,000 schools through Foursquare's API to store the venue IDs. If you check into a school that we haven't matched to a venue in our database, we'll use your location and the school name to show you a number of guesses as to what school you're at (we don't store your checkins or location data in our database at all). Once you pick one, we'll use your checkin data to link the school venue, so the next user that checks into that school will immediately see school stats.&lt;/p&gt;

&lt;p&gt;Many news applications are location-based, and we're excited to start experimenting with bringing our apps to users where they are.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/1jttNDg6yVw" height="1" width="1"/&gt;</description>
		<dc:author>Al Shaw</dc:author>
		<dc:subject />
		<dc:date>2013-01-24T15:19:54-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/a-new-way-to-check-in-on-education-inequality/</feedburner:origLink></item>

	<item>
		<title>How To Edit 52,000 Stories at Once</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/EGidLMw7zWs/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/how-to-edit-52000-stories-at-once/#25417</guid>
		<description>&lt;p&gt;Today we're launching an
update to our &lt;a href="http://projects.propublica.org/schools/"&gt;Opportunity Gap&lt;/a&gt; news application, with &lt;a href="http://propublica.org/article/new-data-analysis-at-some-schools-achievement-lags-behind-opportunity"&gt;two new data points&lt;/a&gt;, better
design on smartphones and integration with the &lt;a href="http://www.propublica.org/nerds/item/a-new-way-to-check-in-on-education-inequality"&gt;Foursquare
social network&lt;/a&gt;.&lt;/p&gt;


&lt;p&gt;Also launching with this
new release are short narrative descriptions of almost all of the more than 52,000
schools in our database, generated algorithmically by &lt;a href="http://www.narrativescience.com/"&gt;Narrative Science&lt;/a&gt;, a
startup based in Chicago. Narrative Science launched in 2010 out of a research
project at Northwestern University. Their platform uses artificial intelligence
to turn structured data into human-readable narratives. For example, Forbes
uses their technology to turn quarterly corporate &lt;a href="http://www.forbes.com/sites/narrativescience/2012/04/19/forbes-earnings-preview-apple-2/"&gt;earnings reports&lt;/a&gt; into short narratives.&lt;/p&gt;


&lt;p&gt;We were introduced to
Narrative Science by the MacArthur Foundation, which has supported ProPublica
since our inception in 2007. In our initial conversations with the team at
Narrative Science, we realized that ProPublica's data team and Narrative
Science share a common goal: To make data tell stories. We also had a
hypothesis, which is that adding narratives to each school page would provide an
easier way for people who learn verbally rather than visually to understand the
data. &lt;/p&gt;


&lt;p&gt;Here&amp;#8217;s how it worked, and
what you might expect if you&amp;#8217;re a news organization looking to experiment with
algorithmically generated stories. &lt;/p&gt;


&lt;p&gt;The project started with a
call with Narrative Science to talk about what was important in the data. We
told them that the &amp;#8220;nut&amp;#8221; of our news application was that all too often, school
districts and states don't distribute educational opportunities to rich and
poor kids equally. &lt;/p&gt;


&lt;p&gt;We sent Narrative
Science&amp;#8217;s engineers a raw data set and walked them through the field layout and
data gotchas.&lt;/p&gt;


&lt;p&gt;Within a few weeks we started
trading drafts &amp;#8211; about a half dozen at a time -- in the form of sample
narratives. We homed in until the narratives clearly expressed what the
often-complex data meant. In addition to making sure our data was right and how
we described it was consistent, we wanted the copy to tell the same story as
our interactive pages did &amp;#8211; a story not just about individual schools but
how each school related to other schools in the state with different poverty
levels. In that way, we worked with the engineers at
Narrative Science much as an editor and reporter do. &lt;/p&gt;


&lt;p&gt;We also had a few style
issues, nothing that wouldn&amp;#8217;t be familiar to editors and reporters everywhere. Smaller
edits ranged from applying AP style to pruning redundant clauses. &lt;/p&gt;


&lt;p&gt;One thing we learned was
that mentioning a data point in a narrative made it seem much more important
than simply including it on the page of an interactive database, so we spent
time picking the right variables to &amp;#8220;promote&amp;#8221; to the narrative. &lt;/p&gt;


&lt;p&gt;On to some of the more
technical stuff: In addition to the data that&amp;#8217;s visible on each page in the
interactive database, our data includes some numbers used behind the scenes,
such as calculated fields and an explicit pairwise relationship between similar
schools. We found that it was easier to send those data points to Narrative
Science rather than have them redo the often-complex calculations. Small
differences between the narrative description and the graphic were cropping up,
and that was the easiest way to avoid them. &lt;/p&gt;


&lt;p&gt;Unlike a normal editing process,
we weren't just working with a single story but tens of thousands of them. It
isn&amp;#8217;t practical to read 52,000 narratives. Also, Narrative Science&amp;#8217;s systems
are more complex than simple boilerplate with interpolated variables. Editing
one narrative does not mean you&amp;#8217;ve edited them all. In addition to recasting whole
paragraphs, their systems generate a variety of sentences to express the same kind
of data, so that reading the narratives for several schools would seem more
natural and not automated. So edits that made sense in one case ended up not
working in other cases, and sentences that seemed correct given one set of
circumstances seemed wrong in others &amp;#8211; often subtly. &lt;/p&gt;


&lt;p&gt;We started getting larger samples
of the generated narratives and we pulled random samples to spot check &amp;#8211;
just as we do with any data project &amp;#8211; looking for problems with agreement
between the narrative and the graphic, and for any confusing wording generated
by the algorithm. Of course, if you see any weirdness &amp;#8211; &lt;a href="mailto:scott.klein@propublica.org"&gt;let us
know&lt;/a&gt;.&lt;/p&gt;


&lt;p&gt;Ultimately, Narrative
Science delivered more than 52,000 narratives to us as JSON files over FTP,
which we imported into our news application. &lt;/p&gt;


&lt;p&gt;Whatever the future of the
news industry holds, it seems clear that it will involve trying lots of things.
And from our vantage point, algorithmic story generation seems like an
intriguing solution to the problem of scaling narrative journalism in the era
of big data. It&amp;#8217;s helped us tune our data journalism for a new audience, and
helped create stories at scale that would have been unthinkable otherwise. &lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/EGidLMw7zWs" height="1" width="1"/&gt;</description>
		<dc:author>Scott Klein</dc:author>
		<dc:subject />
		<dc:date>2013-01-24T15:19:01-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/how-to-edit-52000-stories-at-once/</feedburner:origLink></item>

	<item>
		<title>New Year’s Resolution: Learn to Code</title>
		<link>http://feeds.propublica.org/~r/propublica/nerds/~3/SAbLY5faiPU/</link>
		<guid isPermaLink="false">http://www.propublica.org/nerds/item/new-years-resolution-learn-to-code/#25361</guid>
		<description>&lt;style&gt;

p.post-headline {

  margin-top:0px;
  font-size:22px !important;
  padding-top:10px;
}

div.sidebar-inject {
display:none
}

li {
  list-style-type:square
}

&lt;/style&gt;


&lt;p&gt;A year ago I didn't know how to code. I had a journalism degree and had made some graphics, but I would have been hard-pressed to explain the difference between Ruby and JavaScript, and I was pretty happy when I got the YouTube video to embed correctly. I considered myself pretty technical but generally avoided the command line. &lt;/p&gt; 

&lt;p&gt;When I did start learning, I was amazed by how much was out there: introductory videos, explanatory blog posts, tips and tricks and step-by-step guides. If you're a journalist who wants to make a news app or a student interested in learning to code, you have plenty of paths to choose from.&lt;/p&gt;

&lt;p&gt;To get coding quickly, nothing beats having a project &amp;#8211; and a deadline. For some of my early projects (with the help of a few developers willing to answer my questions), I learned just enough to make it work: how to &lt;a href="http://www.propublica.org/special/what-the-frack-is-in-that-water"&gt;show/hide a div&lt;/a&gt; or &lt;a href="http://www.propublica.org/special/a-history-of-fda-inaction-on-animal-antibiotics"&gt;generate a bar chart from a CSV file&lt;/a&gt;. Repeating those same steps for later projects helped me actually understand what I was doing (and how the code worked).&lt;/p&gt;

&lt;p&gt;As the year went on I came across a number of resources that were also very helpful in digging deeper: tutorial videos that finally cleared up some long-standing befuddlement, random snippets of code that made the damn thing work after three hours of head-banging, some really good explanations I wish I had read weeks earlier, etc. I hope these links will be helpful to you as you get started coding. Most are Ruby and Rails related &amp;#8211; Ruby is a programming language and Rails is a "framework" that makes coding quicker and easier &amp;#8211; with some JavaScript, Git, and other stuff that came in handy over the past year.&lt;/p&gt;

&lt;p&gt;&amp;nbsp;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Ruby and Rails&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;&lt;a href="https://peepcode.com/products/meet-rails-3-i"&gt;Peepcode: Meet Rails 3  Part 1&lt;/a&gt; and &lt;a href="https://peepcode.com/products/meet-rails-3-ii"&gt;Part 2&lt;/a&gt;&lt;br&gt;
Great introductory videos that are also very convenient &amp;#8211; you can download them right onto your phone and watch them on the subway (if that&amp;#8217;s your thing). Super comprehensive, might wanna watch them a couple times to get everything.&lt;/p&gt;
&lt;p&gt;&lt;a href="http://schneems.com/ut-rails"&gt;Rails Course&lt;/a&gt; from the University of Texas&lt;br&gt;
A complete set of videos from a Rails online class from a professor at the University of Texas, a bit slower-paced and more detailed than the Peepcode lectures.&lt;/p&gt;
&lt;p&gt;&lt;a href="http://guides.rubyonrails.org/"&gt;Rails Guides&lt;/a&gt;&lt;br&gt;
This is one of the best resources for both getting an introduction to different components (like routes or active record, etc) and also something to look back on every time you forget the correct syntax.&lt;/p&gt;
&lt;p&gt;&lt;a href="http://ruby.bastardsbook.com/toc/"&gt;Bastards Book of Ruby&lt;/a&gt;&lt;br&gt;
For a detailed introduction to how Ruby works, from erstwhile ProPubNerd Dan Nguyen.&lt;/p&gt;
&lt;p&gt;&lt;a href="http://ruby.railstutorial.org/ruby-on-rails-tutorial-book"&gt;Ruby on Rails Tutorial&lt;/a&gt;&lt;br&gt;
Super detailed, super comprehensive online book that walks you through the entire process of building an app from install to deploy.&lt;/p&gt;
&lt;p&gt;&lt;a href="http://railscasts.com/"&gt;RailsCasts&lt;/a&gt;&lt;br&gt;
Screencasts for learning Rails. Tips and tricks, how-to&amp;#8217;s for solving specific problems, and introductory &amp;#8220;tours&amp;#8221; of Rails techniques.&lt;/p&gt;
&lt;p&gt;&lt;a href="http://railsforzombies.org/"&gt;Rails for Zombies&lt;/a&gt;&lt;br&gt;
A pretty entertaining way to get the basics of Rails down in a real &amp;#8220;lesson&amp;#8221; format (complete with quizzes and exercises and many zombie references).&lt;/p&gt;

&lt;ul&gt;&lt;strong&gt;More Ruby and Rails Resources&lt;/strong&gt;&lt;br&gt;
&lt;li&gt;&lt;a href="http://www.ruby-lang.org/en/"&gt;Ruby Programming Language official page&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://rubyonrails.org/screencasts"&gt;Screencasts from the Rails official page&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://ruby-doc.org/docs/ProgrammingRuby/html/index.html"&gt;Programming Ruby: The Pragmatic Programmer&amp;#8217;s Guide&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://rubykoans.com/"&gt;Learn Ruby with the Neo Ruby Koans&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://pine.fm/LearnToProgram/"&gt;Learn to Program (ruby)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://api.rubyonrails.org/"&gt;Rails API&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://rubylearning.com/satishtalim/tutorial.html"&gt;Ruby Tutorial&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://jasonseifer.com/2010/04/06/rake-tutorial"&gt;Rake&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://yaronwalfish.com/ruby-loop-the-ruby-way/"&gt;Loops&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://paulsturgess.co.uk/articles/49-using-helper-methods-in-ruby-on-rails"&gt;Helpers&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://paulsturgess.co.uk/articles/28-how-to-assign-a-css-class-or-id-to-link_to-method-in-ruby-on-rails"&gt;Link_to and css classes&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.skorks.com/2009/08/how-a-ruby-case-statement-works-and-what-you-can-do-with-it/"&gt;Case Statements in Ruby&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.foragoodstrftime.com/"&gt;Strftime&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://tomafro.net/2009/08/using-indexes-in-rails-index-your-associations"&gt;Indexes in Rails&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;ul&gt;&lt;strong&gt;JavaScript&lt;/strong&gt;&lt;br /&gt;
  &lt;li&gt;&lt;a href="http://eloquentjavascript.net/"&gt;Eloquent JavaScript: A Modern Introduction to Programming&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="https://peepcode.com/products/jquery"&gt;Peepcode: Meet jQuery&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://www.codecademy.com/tracks/javascript-combined"&gt;Codecademy: JavaScript&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://www.amazon.com/JavaScript-Good-Parts-Douglas-Crockford/dp/0596517742"&gt;JavaScript: The Good Parts&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;ul&gt;&lt;strong&gt;Command Line&lt;/strong&gt;&lt;br /&gt;
  &lt;li&gt;&lt;a href="https://peepcode.com/products/meet-the-command-line"&gt;Meet the Command Line&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://learncodethehardway.org/cli/book/cli-crash-course.html"&gt;Zed Shaw&amp;#8217;s The Command Line Crash Course&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;ul&gt;&lt;strong&gt;Sinatra:&lt;/strong&gt; a lighter framework than Rails&lt;br /&gt;
  &lt;li&gt;&lt;a href="http://www.alistapart.com/articles/rapid-prototyping-with-sinatra/"&gt;Rapid Prototyping with Sinatra&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;ul&gt;&lt;strong&gt;Git:&lt;/strong&gt; a source code management system that keeps track of versions and aids collaboration. &lt;br /&gt;
&lt;li&gt;&lt;a href="http://www.alistapart.com/articles/get-started-with-git/"&gt;Get Started with Git&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://git-scm.com/book/en/"&gt;Pro Git Book&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;ul&gt;&lt;strong&gt;CSS&lt;/strong&gt;&lt;br /&gt;
  &lt;li&gt;&lt;a href="http://css-tricks.com/examples/ShapesOfCSS/"&gt;Shapes of CSS&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://net.tutsplus.com/tutorials/html-css-techniques/the-30-css-selectors-you-must-memorize/"&gt;The 30 CSS Selectors you Must Memorize&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://www.designmeme.com/tutorials/csscurlyquotes/"&gt;CSS Curly Quotes&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://nicolasgallagher.com/pure-css-gui-icons/demo/#non"&gt;Pure CSS GUI Icons&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://css-tricks.com/examples/nth-child-tester/"&gt;CSS Nth Child Tester&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://www.stuffandnonsense.co.uk/archives/images/specificitywars-05v2.jpg"&gt;CSS Specificity, Star Wars edition&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://css-tricks.com/pseudo-class-selectors/"&gt;Pseudo Class Selectors&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;ul&gt;&lt;strong&gt;Colors&lt;/strong&gt;&lt;br /&gt;
  &lt;li&gt;&lt;a href="http://colororacle.cartography.ch/"&gt;Color Oracle&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://colorbrewer2.org/"&gt;Color Brewer&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://0to255.com/"&gt;0to255&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://www.colorzilla.com/gradient-editor/"&gt;CSS Color Gradient Tool&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;ul&gt;&lt;strong&gt;Regular Expressions:&lt;/strong&gt; used to match patterns of text for powerful find-and-replace, data cleaning, formatting, etc. &lt;br /&gt;
  &lt;li&gt;&lt;a href="http://rubular.com/"&gt;Ruby regular expression editor&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://ruby.bastardsbook.com/chapters/regexes/"&gt;Regular Expressions&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;a href="http://regex.learncodethehardway.org/book/"&gt;Learn Regex The Hard Way&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;ul&gt;&lt;strong&gt;Sublime Text&lt;/strong&gt;: a graphical text editor&lt;br /&gt;
&lt;li&gt;&lt;a href="http://net.tutsplus.com/tutorials/tools-and-tips/sublime-text-2-tips-and-tricks/"&gt;Sublime Text Tips and Tricks&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;ul&gt;&lt;strong&gt;Vim&lt;/strong&gt;: a text editor for the command line&lt;br /&gt;
&lt;li&gt;&lt;a href="http://linux.die.net/Intro-Linux/sect_06_02.html"&gt;Using the Vim Editor&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;ul&gt;
&lt;strong&gt;Markdown&lt;/strong&gt;: a tool for writing plain text that's easily converted to HTML &lt;br /&gt;
&lt;li&gt;&lt;a href="http://daringfireball.net/projects/markdown/"&gt;Markdown Basics&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;&lt;img src="http://feeds.feedburner.com/~r/propublica/nerds/~4/SAbLY5faiPU" height="1" width="1"/&gt;</description>
		<dc:author>Lena Groeger</dc:author>
		<dc:subject />
		<dc:date>2012-12-19T14:51:14-05:00</dc:date>
    <feedburner:origLink>http://www.propublica.org/nerds/item/new-years-resolution-learn-to-code/</feedburner:origLink></item>

    
    </channel>
</rss>
