🔔 Alert..!! Get 2 Month Free Cloud Hosting With $200 Bonus From Digital Ocean ACTIVATE DEAL

Extract and parse structured data with jQuery selectors, XPath, or JsonPath from common web formats such as HTML, XML, and JSON.

jSON XML

Documentation

web-data-extractor

Maven Central Build Status codecov.io License

Extract and parse structured data with jQuery selectors, XPath, or JsonPath from common web formats such as HTML, XML, and JSON.

Implements:

Usage

To add a dependency on Web-Data-Extractor using Maven, use the following:

<dependency>     <groupId>im.nll.data</groupId>     <artifactId>extractor</artifactId>     <version>0.9.6</version> </dependency>

To add a dependency using Gradle:

dependencies {   compile 'im.nll.data:extractor:0.9.6' } 

Examples

extract single data

// Extract a single value: pick the "div.followers" element with a selector,
// then narrow the selected text with the \d+ regex and return it as a String.
// NOTE(review): presumably the first regex match is returned - confirm against RegexExtractor.
String followers = Extractors.on(baseHtml)
                   .extract(new SelectorExtractor("div.followers"))
                   .with(new RegexExtractor("\\d+"))
                   .asString();

Or use the static helper methods:

// Same extraction, written with the static selector(...) and regex(...) helper methods
// instead of constructing the extractor objects directly.
String followers = Extractors.on(baseHtml)
                   .extract(selector("div.followers"))
                   .with(regex("\\d+"))
                   .asString();

Or use the short string form (`type:expression`):

String followers = Extractors.on(baseHtml)                    .extract("selector:div.followers"))                    .with(regex("\\d+"))                    .asString();

More methods

 String year = Extractors.on("<div> Talk is cheap. Show me the code. - Fri, 25 Aug 2000 </div>")                 .extract(selector("div")) // extract with selector                 .filter(value -> value.trim()) // trim result                 .with(regex("20\\d{2}")) // get year with regex                 .filter(value -> "from " + value) // append 'from' string                 .asString();         Assert.assertEquals("from 2000", year);

extract data to map

    /**
     * Extracts three named fields from {@code baseHtml} and collects them into
     * one map keyed by the field name passed to each {@code extract} call.
     */
    @Test
    public void testToMap() throws Exception {
        Map<String, String> fields = Extractors.on(baseHtml)
                .extract("title", selector("a.title"))
                // the regex narrows the followers text down to its digits
                .extract("followers", selector("div.followers")).with(regex("\\d+"))
                .extract("description", selector("div.description"))
                .asMap();

        String title = fields.get("title");
        String followers = fields.get("followers");
        String description = fields.get("description");

        Assert.assertEquals("fivesmallq", title);
        Assert.assertEquals("29671", followers);
        Assert.assertEquals("Talk is cheap. Show me the code.", description);
    }

extract data to map list

    @Test     public void testToMapList() throws Exception {         //split param must implements ListableExtractor         List<Map<String, String>> languages = Extractors.on(listHtml)             .split(selector("tr.item.html"))                 .extract("type", selector("td.type"))                 .extract("name", selector("td.name"))                 .extract("url", selector("td.url"))                 .asMapList();         Assert.assertNotNull(languages);         Map<String, String> second = languages.get(1);         Assert.assertEquals(languages.size(), 3);         Assert.assertEquals(second.get("type"), "dynamic");         Assert.assertEquals(second.get("name"), "Ruby");         Assert.assertEquals(second.get("url"), "https://www.ruby-lang.org");     }

extract data to bean

    /**
     * Extracts three named fields from {@code baseHtml} and binds them to a
     * {@code Base} bean; each field name matches the bean property it is read back from.
     */
    @Test
    public void testToBean() throws Exception {
        Base profile = Extractors.on(baseHtml)
                .extract("title", selector("a.title"))
                // the regex narrows the followers text down to its digits
                .extract("followers", selector("div.followers")).with(regex("\\d+"))
                .extract("description", selector("div.description"))
                .asBean(Base.class);

        Assert.assertEquals("fivesmallq", profile.getTitle());
        Assert.assertEquals("29671", profile.getFollowers());
        Assert.assertEquals("Talk is cheap. Show me the code.", profile.getDescription());
    }

extract data to bean list

    /**
     * Splits {@code listHtml} into one fragment per matching row and binds the
     * three fields extracted from every fragment to a {@code Language} bean.
     */
    @Test
    public void testToBeanList() throws Exception {
        List<Language> languages = Extractors.on(listHtml)
                .split(selector("tr.item.html"))
                .extract("type", selector("td.type"))
                .extract("name", selector("td.name"))
                .extract("url", selector("td.url"))
                .asBeanList(Language.class);

        Assert.assertNotNull(languages);
        // Check the size before indexing so a short list fails with a clear
        // assertion message instead of an IndexOutOfBoundsException.
        Assert.assertEquals(3, languages.size());

        Language second = languages.get(1);
        // JUnit's contract is assertEquals(expected, actual); keeping that order
        // makes the failure message report the right value as "expected".
        Assert.assertEquals("dynamic", second.getType());
        Assert.assertEquals("Ruby", second.getName());
        Assert.assertEquals("https://www.ruby-lang.org", second.getUrl());
    }

support Embeddable bean

Set a field of an embedded bean by using the dotted name `embeddable.fieldName` (for example, `config.encoding`).

    @Test     public void testEmbeddable() {         List<Activity> activities = Extractors.on(base5Xml)                 .split(xpath("//ProcessDefinition/activity").removeNamespace())                 .extract("name", xpath("//activity/@name"))                 .extract("type", xpath("//activity/type/text()"))                 .extract("resourceType", xpath("//activity/resourceType/text()"))                 .extract("config.encoding", xpath("//activity/config/encoding/text()"))                 .extract("config.pollInterval", xpath("//activity/config/pollInterval/text()"))                     //if pollInterval is null set to default '5'                     .filter(value -> value == null ? value : "5")                 .extract("config.compressFile", xpath("//activity/config/compressFile/text()"))                 .extract("inputBindings.fileName", xpath("//activity/inputBindings/WriteActivityInputTextClass/fileName/value-of/@select"))                 .extract("inputBindings.textContent", xpath("//activity/inputBindings/WriteActivityInputTextClass/textContent/value-of/@select"))                 .asBeanList(Activity.class);         Assert.assertNotNull(activities);         Assert.assertEquals(1, activities.size());         Activity activity = activities.get(0);         Assert.assertEquals("Output1", activity.getName());         Assert.assertEquals("com.tibco.plugin.file.FileWriteActivity", activity.getType());         //config         Config config = activity.getConfig();         Assert.assertEquals("text", config.getEncoding());         Assert.assertEquals("None", config.getCompressFile());         Assert.assertEquals("5", config.getPollInterval());         //bind         BindingSpec bindingSpec = activity.getInputBindings();         Assert.assertEquals("$_globalVariables/ns:GlobalVariables/GlobalVariables/OutputLocation", bindingSpec.getFileName());         Assert.assertEquals("$File-Poller/pfx:EventSourceOuputTextClass/fileContent/textContent", 
bindingSpec.getTextContent());     }

filter

`before` and `after` are global filters: they are applied to every extracted value, and the per-field `filter` then runs on their result.

    @Test     public void testToBeanListFilterBeforeAndAfter() throws Exception {         List<Language> languages = Extractors.on(listHtml)                 //before and after just process the extract value, then execute the follow filter method.                 .before(value -> "|before|" + value)                 .after(value -> value + "|after|")                 .split(xpath("//tr[@class='item']"))                 .extract("type", xpath("//td[1]/text()")).filter(value -> "filter:" + value)                 .extract("name", xpath("//td[2]/text()")).filter(value -> "filter:" + value)                 .extract("url", xpath("//td[3]/text()")).filter(value -> "filter:" + value)                 .asBeanList(Language.class);         Assert.assertNotNull(languages);         Language second = languages.get(1);         Assert.assertEquals(languages.size(), 3);         Assert.assertEquals(second.getType(), "filter:|before|dynamic|after|");         Assert.assertEquals(second.getName(), "filter:|before|Ruby|after|");         Assert.assertEquals(second.getUrl(), "filter:|before|https://www.ruby-lang.org|after|");     }

see Example

Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/fivesmallq/web-data-extractor.


You May Also Like