您的位置：首页 > 编程语言 > Go语言

使用Google接口实时翻译

2015-12-03 15:08 489 查看

在项目当中，获得到的数据是英文，如果想翻译为中文内容，可直接调用Google接口： https://translate.google.com.hk/translate_a/single?client=t&sl=en&tl=zh-CN&hl=zh-CN&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&source=btn&ssel=0&tsel=0&kc=0&tk=470115|78768&q=%22world%22
当然，Google服务需要翻墙，所以要用到代理，这里代理的获取就暂且不细说（可以向代理商购买）。下面主要提供Java程序与配置文件参考：

TranslateUtil.java ：

package com.ttz.crawl.util;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import com.ttz.crawl.common.Page;
import com.ttz.crawl.config.CrawlConfig;
import com.ttz.crawl.fetch.FetchRet;
import com.ttz.crawl.fetch.HttpClientFetch;
import com.ttz.crawl.proxy.ProxyPoolMan;

/**
 * Calls the Google Translate web endpoint through the crawler's HTTP fetcher.
 *
 * <p>The request URL is assembled from the {@code translateUrl} configuration parameter, a
 * template whose pieces are separated by {@code *}. The template must yield at least four
 * pieces: piece 0 and piece 2 are used verbatim, piece 1 is URL-encoded, the text to translate
 * (URL-encoded) is inserted between piece 2 and piece 3.
 *
 * <p>NOTE(review): {@link #translate(String)} toggles the static {@code ProxyPoolMan.enable}
 * flag around the request; concurrent callers would race on that flag - confirm it is only
 * used from a single thread.
 *
 * @author zhaoyuchun
 */
public class TranslateUtil
{
    /** Number of {@code *}-separated pieces that {@code translateUrl} must contain. */
    private static final int TEMPLATE_PARTS = 4;

    /**
     * Pattern passed to the fetcher together with the URL; matches a {@code [[[ ... ]]]} span.
     * Presumably used by the fetcher to decide whether the response is valid - TODO confirm.
     */
    public static Pattern cntValidPatt = null;

    /**
     * Extraction pattern: captures the text enclosed in typographic quotes that directly
     * follows {@code [[["} and is followed by {@code ",}.
     */
    public static Pattern cntCHSPatt = null;

    /** Fetcher used to issue the translation request. */
    public static HttpClientFetch fetcher = null;

    /** URL template read from the {@code translateUrl} configuration parameter. */
    public static String translateUrl = null;

    public static Logger log = Logger.getLogger(TranslateUtil.class);

    static
    {
        fetcher = new HttpClientFetch();
        cntValidPatt = Pattern.compile("\\[\\[\\[.*?\\]\\]\\]");
        cntCHSPatt = Pattern.compile("\\[\\[\\[\"“(.*?)”\",");
        translateUrl = CrawlConfig.getParam("translateUrl");
    }

    /**
     * Translates the given text by fetching the configured translation URL and extracting
     * the result with {@link #cntCHSPatt}.
     *
     * <p>{@code ProxyPoolMan.enable} is set to {@code true} for the duration of the call and
     * is always set back to {@code false} afterwards, including on every failure path.
     *
     * @param str the text to translate; {@code null} yields {@code null}
     * @return the extracted translation, or {@code null} when the input is {@code null}, the
     *         URL template is missing or malformed, the page cannot be fetched, or any
     *         exception occurs
     */
    public static String translate(String str)
    {
        ProxyPoolMan.enable = true;
        try
        {
            if (str == null)
            {
                log.error("can not translate a null string.");
                return null;
            }

            // Built inside the try block so that a bad configuration can never leave the
            // proxy flag switched on.
            String url = buildUrl(str);
            if (url == null)
            {
                return null;
            }

            FetchRet ret = fetcher.getPageRet(url, cntValidPatt, null, null);
            if (ret == null || ret.page == null)
            {
                log.error("can not get the translatePage. url:" + url);
                return null;
            }

            Page page = ret.page;
            // Pull the translated text out of the fetched page.
            return PageExtractUtil.extractPageLabel(page, cntCHSPatt);
        }
        catch (Exception e)
        {
            // Pass the throwable separately so log4j records the stack trace.
            log.error("translate failed. text:" + str, e);
            return null;
        }
        finally
        {
            // NOTE(review): this resets the flag to false even if it was true before the
            // call (original behavior, kept for compatibility) - confirm that is intended.
            ProxyPoolMan.enable = false;
        }
    }

    /**
     * Expands the {@code translateUrl} template for the given text.
     *
     * @param text the non-null text to embed in the URL
     * @return the request URL, or {@code null} when the template is missing or has fewer than
     *         {@value #TEMPLATE_PARTS} pieces (the problem is logged)
     * @throws UnsupportedEncodingException if the "utf-8" encoding is not supported
     */
    private static String buildUrl(String text) throws UnsupportedEncodingException
    {
        if (translateUrl == null)
        {
            log.error("config param translateUrl is not set.");
            return null;
        }

        // Limit -1 keeps a trailing empty piece, so a template may end with '*'.
        String[] parts = translateUrl.split("\\*", -1);
        if (parts.length < TEMPLATE_PARTS)
        {
            log.error("translateUrl must contain at least " + (TEMPLATE_PARTS - 1)
                    + " '*' separators. translateUrl:" + translateUrl);
            return null;
        }

        return parts[0]
                + URLEncoder.encode(parts[1], "utf-8")
                + parts[2]
                + URLEncoder.encode(text, "utf-8")
                + parts[3];
    }

    /**
     * Manual smoke test: translates the word "proxy" and prints the result.
     *
     * @param args unused
     * @throws UnsupportedEncodingException never thrown by the current body; kept so the
     *         signature stays unchanged
     */
    public static void main(String args[]) throws UnsupportedEncodingException
    {
        System.out.println(translate("proxy"));
    }
}

StockTwits.properties:

#common
home_path = C:\\Users\\zhaoyuchun\\workspace\\91z_2014
db_conf = res/db.properties
log_conf = res/log4j.properties
tair_conf = res/tair.properties
redis_conf = res/redis.properties
enable_encode_to_utf8 = true
socket_port = 50010
site_id = 3
data_name = StockTwits

#以*分割不同的url
base_url = http://stocktwits.com/streams/poll?stream=symbol&max=*&stream_id=*&substream=*&item_id=*http://stocktwits.com/symbol/http://stocktwits.com/ 
fetch_socket_timeout = 30000
fetch_connect_timeout = 10000
#unit: second
sleeptime = 1
#unit: minute
intervals = 60
#stop condition:0--crawl all; 1-- < page_size; 2-- begin_date <= pub_date < end_date; 3--when crawl crawled page; 4--only crawl uncrawled url, only compare url in dedup db
#if thread_no > 1, stop_type 可以有多个值,对应不同的thread,以;分割, e.g. 0;4
stop_type = 0
stock_type = 2
#set value when stop_type == 1
page_size = 1
#set value when stop_type == 2
#date format:yyyy-mm-dd
crawl_begin_date = 2012-02-20
crawl_end_date = 2012-02-21

#cookie setting
enable_cookie = false
#当内存中存储的失败url超过max_capacity之后，会写入文件
max_url_capacity = 1000
max_page_size = 5000

#regex
regex_stream_id = 'streamId' : '(\\d+)'\,
regex_url_id = max_id: (\\d+)\,.*?poll_id: '(\\d+)'\,.*?substream:\\s'(.*?)'\,
regex_list =<li data-src=.*?id.*?(\\d+).*?created_at":"(.*?\\d+.*?) -.*?user_path":"\\\\\\/(.*?)"\,"avatar_url":"(.*?)"[\\s\\S]*?body.*?;:\\\\\\?"([\\s\\S]*?);links[\\s\\S]*?total_likes":(\\d+?)\,(?:[^<]*?replies":(\\d+)\,)?
#proxy
enable_proxy = true
check_file = 10000000
proxy_file = data/proxy.xml
proxy_crawl_properties = res/proxy.properties
#the upper-bound pages to fetch from a host in a WATCH_INTERVAL
max_per_interval = 6000
#fetch density control interval, "ms" as unit
watch_interval = 60000
#about a proxy: if BLOCKVALUE urls are not fetched within BLOCK_FORBIDEN_TIME, the proxy is invalid for the host; delete the proxy
#and when the proxy is deleted, delete the current url
block_forbidden_item = 120000
block_span = 3600000
#about a host:in FORBIDEN_PERIOD ,FORBIDEN_COUNT urls are not fetched,the host is invalid,then delete all the host urls in FORBIDEN_PERIOD
failure_max_count = 2
#forbidden_period = 1000000
failure_watch_interval = 1000000
failure_clear_span = 3600000
proxy_pool_min_size = 500
pool_update_span = 3600000

block_span = 3600000
#about a host:in FORBIDEN_PERIOD ,FORBIDEN_COUNT urls are not fetched,the host is invalid,then delete all the host urls in FORBIDEN_PERIOD
#forbidden_count = 10
failure_value = 2
#forbidden_period = 1000000
failure_watch_interval = 1000000
failure_clear_span = 3600000
proxy_pool_minsize = 10
pool_update_span = 3600000

#dedup
enable_dedup = true

#writer
enable_write = true
save_page_path = ${home_path}/data/page
max_frequent = 1800000
max_per_queue = 5000
min_page_size = 10

#tair setting
space_name = 1000
redis_key  = EXTRACT0000

#thread setting
thread_numbers=1
thread_no=1

#host setting
machine_id=1

proxy_url = http://www.yun-daili.com/api.asp?key=20151109200040011100200069397641&getnum=200&area=2&proxytype=1
translateUrl = https://translate.google.com.hk/translate_a/single?client=t&sl=en&tl=zh-CN&hl=zh-CN&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&source=btn&ssel=0&tsel=0&kc=0&tk=470115*|*78768&q=%22*%22

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： 谷歌 java 翻译接口

相关文章推荐

新的分享

章节导航