package collection_map;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a web page, strips its markup line by line and writes the
 * extracted text to a local file.
 *
 * <p>The HTML handling is purely regex based: it removes {@code <script>} and
 * {@code <style>} elements, then every remaining tag, then all whitespace.
 */
public class Test {

    /** Regex for a complete {@code <script>...</script>} element. */
    private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>";

    /** Regex for a complete {@code <style>...</style>} element. */
    private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>";

    /** Regex for any single HTML tag. */
    private static final String regEx_html = "<[^>]+>";

    /**
     * Regex for a run of whitespace (spaces, tabs, CR, LF, ...).
     * Replaces the former {@code "\\s*|\t|\r|\n"}, which also matched the empty
     * string at every position; the result of {@code replaceAll("")} is the same.
     */
    private static final String regEx_space = "\\s+";

    // Patterns are immutable and thread-safe, so compile them once instead of
    // on every call (main() invokes delHTMLTag once per input line).
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
    private static final Pattern HTML_TAG_PATTERN = Pattern.compile(regEx_html);
    private static final Pattern SPACE_PATTERN = Pattern.compile(regEx_space);

    /** Page fetched by {@link #main(String[])}. */
    private static final String TARGET_URL =
            "http://ssl.gongyi.qq.com/m/weixin/detail_yqj_commentList.html?pg=2&did=1215372601201609087100023344&oid=oproJj0dWhq4R_jCp3iYZgb3cbPY";

    /** File (relative to the working directory) that receives the extracted text. */
    private static final String OUTPUT_FILE_NAME = "test.txt";

    /**
     * Removes script elements, style elements, all remaining HTML tags and all
     * whitespace from the given string.
     *
     * @param htmlStr the HTML fragment to clean; may be {@code null}
     * @return the remaining text with every whitespace character removed, or an
     *         empty string when {@code htmlStr} is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        // Order matters: script/style bodies must go before the generic tag
        // pattern, otherwise only their tags would be stripped and their
        // contents would survive as text.
        htmlStr = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
        htmlStr = STYLE_PATTERN.matcher(htmlStr).replaceAll("");
        htmlStr = HTML_TAG_PATTERN.matcher(htmlStr).replaceAll("");
        htmlStr = SPACE_PATTERN.matcher(htmlStr).replaceAll("");
        return htmlStr.trim();
    }

    /**
     * Extracts the first sentence of the text contained in an HTML fragment.
     *
     * <p>The fragment is cleaned with {@link #delHTMLTag(String)} and then cut
     * after the first ideographic full stop (U+3002). If the cleaned text
     * contains no such full stop, the result is the empty string.
     *
     * @param htmlStr the HTML fragment; may be {@code null}
     * @return the text up to and including the first U+3002, or {@code ""}
     */
    public static String getTextFromHtml(String htmlStr) {
        htmlStr = delHTMLTag(htmlStr);
        htmlStr = htmlStr.replace(" ", "");
        // indexOf yields -1 when there is no full stop, so the end index is 0
        // and substring returns "".
        htmlStr = htmlStr.substring(0, htmlStr.indexOf("\u3002") + 1);
        return htmlStr;
    }

    /**
     * Fetches {@link #TARGET_URL}, runs every line of the response through
     * {@link #getTextFromHtml(String)} and writes the results, without
     * separators, to {@link #OUTPUT_FILE_NAME}.
     *
     * <p>The response is decoded as UTF-8; the output file is written with the
     * platform default charset (behaviour of {@link FileWriter}).
     * Because the page is processed one line at a time, a script or style
     * element that spans several lines is not removed as a unit.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            URL url = new URL(TARGET_URL);
            URLConnection conn = url.openConnection();
            File file = new File(OUTPUT_FILE_NAME);

            // try-with-resources closes both streams even when reading or
            // writing fails part-way. FileWriter creates the file if needed.
            try (BufferedReader br = new BufferedReader(
                         new InputStreamReader(conn.getInputStream(), "utf-8"));
                 BufferedWriter bw = new BufferedWriter(
                         new FileWriter(file.getAbsoluteFile()))) {
                String inputLine;
                while ((inputLine = br.readLine()) != null) {
                    bw.write(getTextFromHtml(inputLine));
                }
            }

            System.out.println("Done");
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}